From d9d04de2555c1c9e7ca2d94c8af17b505c5c18ec Mon Sep 17 00:00:00 2001
From: Abhishek Kaushik
Date: Sat, 21 Jun 2025 21:48:05 +0530
Subject: [PATCH 1/8] [X86] Combine `store + vselect` to `masked_store`

Add a new combine to replace
```
(store ch (vselect cond truevec (load ch ptr offset)) ptr offset)
```
with
```
(mstore ch truevec ptr offset cond)
```
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  78 +++++
 .../test/CodeGen/X86/combine-storetomstore.ll | 276 ++++++++++++++++++
 2 files changed, 354 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/combine-storetomstore.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 307a237e2955c..1f8c5836f876d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -66,6 +66,7 @@
 #include
 #include
 #include
+#include <queue>
 using namespace llvm;

 #define DEBUG_TYPE "x86-isel"
@@ -53388,6 +53389,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

+static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
+                                 const SDLoc &Dl,
+                                 const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX() && !Subtarget.hasAVX2() && !Subtarget.hasAVX512())
+    return SDValue();
+
+  if (!Store->isSimple())
+    return SDValue();
+
+  SDValue StoredVal = Store->getValue();
+  SDValue StorePtr = Store->getBasePtr();
+  SDValue StoreOffset = Store->getOffset();
+  EVT VT = StoredVal.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT))
+    return SDValue();
+
+  if (StoredVal.getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  SDValue Mask = StoredVal.getOperand(0);
+  SDValue TrueVec = StoredVal.getOperand(1);
+  SDValue FalseVec = StoredVal.getOperand(2);
+
+  LoadSDNode *Load = cast<LoadSDNode>(FalseVec.getNode());
+  if (!Load || !Load->isSimple())
+    return SDValue();
+
+  SDValue LoadPtr = Load->getBasePtr();
+  SDValue LoadOffset = Load->getOffset();
+
+  if (StorePtr != LoadPtr || StoreOffset != LoadOffset)
+    return SDValue();
+
+  auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) {
+    std::queue<SDValue> Worklist;
+
+    Worklist.push(Store->getChain());
+
+    while (!Worklist.empty()) {
+      SDValue Chain = Worklist.front();
+      Worklist.pop();
+
+      SDNode *Node = Chain.getNode();
+      if (!Node)
+        return false;
+
+      if (const auto *MemNode = dyn_cast<MemSDNode>(Node))
+        if (!MemNode->isSimple() || MemNode->writeMem())
+          return false;
+
+      if (Node == Load)
+        return true;
+
+      if (Node->getOpcode() == ISD::TokenFactor) {
+        for (unsigned i = 0; i < Node->getNumOperands(); ++i)
+          Worklist.push(Node->getOperand(i));
+      } else {
+        Worklist.push(Node->getOperand(0));
+      }
+    }
+
+    return false;
+  };
+
+  if (!IsSafeToFold(Store, Load))
+    return SDValue();
+
+  return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr,
+                            StoreOffset, Mask, Store->getMemoryVT(),
+                            Store->getMemOperand(), Store->getAddressingMode());
+}
+
 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
@@ -53713,6 +53788,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                          St->getMemOperand()->getFlags());
   }

+  if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget))
+    return MaskedStore;
+
   return SDValue();
 }

diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
new file mode 100644
index 0000000000000..75d0dd85cafda
--- /dev/null
+++ 
b/llvm/test/CodeGen/X86/combine-storetomstore.ll @@ -0,0 +1,276 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s -check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s -check-prefix=AVX512 + + +define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_success: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_volatile_load: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_load: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_load: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_volatile_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd 
{{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_store: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_intervening: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: subq $32, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: .cfi_offset %rbx, -16 +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: callq use_vec@PLT +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rbx) +; AVX-NEXT: addq $32, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_intervening: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: .cfi_def_cfa_offset 48 +; AVX2-NEXT: .cfi_offset %rbx, -16 +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: callq use_vec@PLT +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rbx) +; AVX2-NEXT: addq $32, %rsp +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 8 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: 
test_masked_store_intervening: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: subq $144, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 160 +; AVX512-NEXT: .cfi_offset %rbx, -16 +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm0 +; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) +; AVX512-NEXT: callq use_vec@PLT +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rbx) +; AVX512-NEXT: addq $144, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { +; AVX-LABEL: foo: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vmovaps (%rsi), %ymm4 +; AVX-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmovaps %ymm1, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: foo: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: foo: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vpsllq $63, %zmm2, %zmm2 +; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512-NEXT: vpmovsxwq %xmm3, %zmm2 +; AVX512-NEXT: vpsllq $63, %zmm2, %zmm2 +; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k2 +; AVX512-NEXT: vmovdqa 
(%rsi), %ymm2 +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} From 9eca2097585765cccbffdb9e76dd373c5e8d0edc Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Sun, 22 Jun 2025 16:54:05 +0530 Subject: [PATCH 2/8] Use pattern match --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 6 ++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 40 ++++++++++------------ 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 83187b4a0241c..d9fbba7c01325 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -537,6 +537,12 @@ m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) { TernaryOpc_match(ISD::LOAD, Ch, Ptr, Offset)); } +template +inline TernaryOpc_match +m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) { + return TernaryOpc_match(ISD::LOAD, Ch, Ptr, Offset); +} + template inline TernaryOpc_match m_InsertElt(const T0_P &Vec, const T1_P &Val, const T2_P &Idx) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1f8c5836f876d..42418188c9c25 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53392,38 +53392,34 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget) { + using namespace llvm::SDPatternMatch; + if (!Subtarget.hasAVX() && !Subtarget.hasAVX2() && !Subtarget.hasAVX512()) return SDValue(); - if (!Store->isSimple()) + if (!Store->isSimple() || Store->isTruncatingStore()) return SDValue(); SDValue StoredVal = Store->getValue(); SDValue StorePtr = Store->getBasePtr(); SDValue StoreOffset = Store->getOffset(); - EVT VT = StoredVal.getValueType(); + EVT VT = Store->getMemoryVT(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT)) return SDValue(); - if (StoredVal.getOpcode() != ISD::VSELECT) + SDValue Mask, TrueVec, LoadCh; + if (!sd_match(StoredVal, + m_VSelect(m_Value(Mask), m_Value(TrueVec), + m_Load(m_Value(LoadCh), m_Specific(StorePtr), + m_Specific(StoreOffset))))) return SDValue(); - SDValue Mask = StoredVal.getOperand(0); - SDValue TrueVec = StoredVal.getOperand(1); - SDValue FalseVec = StoredVal.getOperand(2); - - LoadSDNode *Load = cast(FalseVec.getNode()); + LoadSDNode *Load = cast(StoredVal.getOperand(2)); if (!Load || !Load->isSimple()) return SDValue(); - SDValue LoadPtr = Load->getBasePtr(); - SDValue LoadOffset = Load->getOffset(); - - if (StorePtr != LoadPtr || StoreOffset != LoadOffset) - return SDValue(); - auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) { std::queue Worklist; @@ -53437,13 +53433,13 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, if (!Node) return false; + if (Node == Load) + return true; + if (const auto *MemNode = dyn_cast(Node)) if (!MemNode->isSimple() || MemNode->writeMem()) return false; - if 
(Node == Load) - return true; - if (Node->getOpcode() == ISD::TokenFactor) { for (unsigned i = 0; i < Node->getNumOperands(); ++i) Worklist.push(Node->getOperand(i)); @@ -53459,8 +53455,8 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, return SDValue(); return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr, - StoreOffset, Mask, Store->getMemoryVT(), - Store->getMemOperand(), Store->getAddressingMode()); + StoreOffset, Mask, VT, Store->getMemOperand(), + Store->getAddressingMode()); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -53727,6 +53723,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()); } + if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget)) + return MaskedStore; + // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right @@ -53788,9 +53787,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); } - if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget)) - return MaskedStore; - return SDValue(); } From c0d5cf02c08627c213510331e943374ba1506046 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Sun, 22 Jun 2025 16:54:31 +0530 Subject: [PATCH 3/8] Fix tests --- llvm/test/CodeGen/X86/combine-storetomstore.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll index 75d0dd85cafda..7b39fa450cbf8 100644 --- a/llvm/test/CodeGen/X86/combine-storetomstore.ll +++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll @@ -218,7 +218,7 @@ define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { -; AVX-LABEL: foo: +; AVX-LABEL: test_masked_store_multiple: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX-NEXT: vpslld $31, %xmm4, %xmm4 @@ -237,7 +237,7 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: foo: +; AVX2-LABEL: test_masked_store_multiple: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 @@ -250,7 +250,7 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: foo: +; AVX512-LABEL: test_masked_store_multiple: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 From b3a45226e67009ce09a76d7290f6d8751ad6415b Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Thu, 26 Jun 2025 23:06:57 +0530 Subject: [PATCH 4/8] Revert last 3 commits --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 6 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 40 ++++++++++--------- .../test/CodeGen/X86/combine-storetomstore.ll | 6 +-- 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index d9fbba7c01325..83187b4a0241c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ 
b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -537,12 +537,6 @@ m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) { TernaryOpc_match(ISD::LOAD, Ch, Ptr, Offset)); } -template -inline TernaryOpc_match -m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) { - return TernaryOpc_match(ISD::LOAD, Ch, Ptr, Offset); -} - template inline TernaryOpc_match m_InsertElt(const T0_P &Vec, const T1_P &Val, const T2_P &Idx) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 42418188c9c25..1f8c5836f876d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53392,34 +53392,38 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget) { - using namespace llvm::SDPatternMatch; - if (!Subtarget.hasAVX() && !Subtarget.hasAVX2() && !Subtarget.hasAVX512()) return SDValue(); - if (!Store->isSimple() || Store->isTruncatingStore()) + if (!Store->isSimple()) return SDValue(); SDValue StoredVal = Store->getValue(); SDValue StorePtr = Store->getBasePtr(); SDValue StoreOffset = Store->getOffset(); - EVT VT = Store->getMemoryVT(); + EVT VT = StoredVal.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT)) return SDValue(); - SDValue Mask, TrueVec, LoadCh; - if (!sd_match(StoredVal, - m_VSelect(m_Value(Mask), m_Value(TrueVec), - m_Load(m_Value(LoadCh), m_Specific(StorePtr), - m_Specific(StoreOffset))))) + if (StoredVal.getOpcode() != ISD::VSELECT) return SDValue(); - LoadSDNode *Load = cast(StoredVal.getOperand(2)); + SDValue Mask = StoredVal.getOperand(0); + SDValue TrueVec = StoredVal.getOperand(1); + SDValue FalseVec = StoredVal.getOperand(2); + + LoadSDNode *Load = cast(FalseVec.getNode()); if (!Load || !Load->isSimple()) return SDValue(); + SDValue LoadPtr = Load->getBasePtr(); + SDValue LoadOffset = Load->getOffset(); + + if (StorePtr != LoadPtr || StoreOffset != LoadOffset) + return SDValue(); + auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) { std::queue Worklist; @@ -53433,13 +53437,13 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, if (!Node) return false; - if (Node == Load) - return true; - if (const auto *MemNode = dyn_cast(Node)) if (!MemNode->isSimple() || MemNode->writeMem()) return false; + if (Node == Load) + return true; + if (Node->getOpcode() == ISD::TokenFactor) { for (unsigned i = 0; i < Node->getNumOperands(); ++i) Worklist.push(Node->getOperand(i)); @@ -53455,8 +53459,8 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, return SDValue(); return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr, - StoreOffset, Mask, VT, Store->getMemOperand(), - Store->getAddressingMode()); + StoreOffset, Mask, Store->getMemoryVT(), + Store->getMemOperand(), Store->getAddressingMode()); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -53723,9 +53727,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()); } - if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget)) - return MaskedStore; - // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. 
// A preferable solution to the general problem is to figure out the right @@ -53787,6 +53788,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); } + if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget)) + return MaskedStore; + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll index 7b39fa450cbf8..75d0dd85cafda 100644 --- a/llvm/test/CodeGen/X86/combine-storetomstore.ll +++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll @@ -218,7 +218,7 @@ define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { -; AVX-LABEL: test_masked_store_multiple: +; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX-NEXT: vpslld $31, %xmm4, %xmm4 @@ -237,7 +237,7 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: test_masked_store_multiple: +; AVX2-LABEL: foo: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 @@ -250,7 +250,7 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_masked_store_multiple: +; AVX512-LABEL: foo: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 From 04366fa8543ce983f9de5914af5931105a09d9bb Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Thu, 26 Jun 2025 23:09:47 +0530 Subject: [PATCH 5/8] Revert "[X86] Combine `store + vselect` to `masked_store``" This reverts commit 73c5a668e2c4ff72195a816b1b3c93279ed46185. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 78 ----- .../test/CodeGen/X86/combine-storetomstore.ll | 276 ------------------ 2 files changed, 354 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/combine-storetomstore.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1f8c5836f876d..307a237e2955c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -66,7 +66,6 @@ #include #include #include -#include using namespace llvm; #define DEBUG_TYPE "x86-isel" @@ -53389,80 +53388,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, - const SDLoc &Dl, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasAVX() && !Subtarget.hasAVX2() && !Subtarget.hasAVX512()) - return SDValue(); - - if (!Store->isSimple()) - return SDValue(); - - SDValue StoredVal = Store->getValue(); - SDValue StorePtr = Store->getBasePtr(); - SDValue StoreOffset = Store->getOffset(); - EVT VT = StoredVal.getValueType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT)) - return SDValue(); - - if (StoredVal.getOpcode() != ISD::VSELECT) - return SDValue(); - - SDValue Mask = StoredVal.getOperand(0); - SDValue TrueVec = StoredVal.getOperand(1); - SDValue FalseVec = StoredVal.getOperand(2); - - LoadSDNode *Load = cast(FalseVec.getNode()); - if (!Load || !Load->isSimple()) - return SDValue(); - - SDValue LoadPtr = Load->getBasePtr(); - SDValue LoadOffset = Load->getOffset(); - - if (StorePtr != LoadPtr || StoreOffset != LoadOffset) - return SDValue(); - - auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) { - std::queue Worklist; - - Worklist.push(Store->getChain()); - - while (!Worklist.empty()) { - SDValue Chain = Worklist.front(); - Worklist.pop(); - - SDNode *Node = Chain.getNode(); - if (!Node) - return false; - - if (const auto *MemNode = dyn_cast(Node)) - if (!MemNode->isSimple() || MemNode->writeMem()) - return false; - - if (Node == Load) - return true; - - if (Node->getOpcode() == ISD::TokenFactor) { - for (unsigned i = 0; i < Node->getNumOperands(); ++i) - Worklist.push(Node->getOperand(i)); - } else { - Worklist.push(Node->getOperand(0)); - } - } - - return false; - }; - - if (!IsSafeToFold(Store, Load)) - return SDValue(); - - return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr, - StoreOffset, Mask, Store->getMemoryVT(), - Store->getMemOperand(), Store->getAddressingMode()); -} - static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53788,9 +53713,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); } - if (SDValue MaskedStore = foldToMaskedStore(St, DAG, dl, Subtarget)) - return MaskedStore; - return SDValue(); } diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll deleted file mode 100644 index 75d0dd85cafda..0000000000000 --- a/llvm/test/CodeGen/X86/combine-storetomstore.ll +++ /dev/null @@ -1,276 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s -check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | 
FileCheck %s -check-prefix=AVX512 - - -define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { -; AVX-LABEL: test_masked_store_success: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX2-LABEL: test_masked_store_success: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_masked_store_success: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %load = load <8 x i32>, ptr %ptr, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load - store <8 x i32> %sel, ptr %ptr, align 32 - ret void -} - -define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { -; AVX-LABEL: test_masked_store_volatile_load: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vmovaps (%rdi), %ymm2 -; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovaps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX2-LABEL: test_masked_store_volatile_load: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_masked_store_volatile_load: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, (%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %load = load volatile <8 x i32>, ptr %ptr, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load - store <8 x i32> %sel, ptr %ptr, align 32 - ret void -} - -define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { -; AVX-LABEL: test_masked_store_volatile_store: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vmovaps (%rdi), %ymm2 -; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovaps %ymm0, (%rdi) -; AVX-NEXT: 
vzeroupper -; AVX-NEXT: retq -; -; AVX2-LABEL: test_masked_store_volatile_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_masked_store_volatile_store: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, (%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %load = load <8 x i32>, ptr %ptr, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load - store volatile <8 x i32> %sel, ptr %ptr, align 32 - ret void -} - -declare void @use_vec(<8 x i32>) - -define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { -; AVX-LABEL: test_masked_store_intervening: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: subq $32, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: .cfi_offset %rbx, -16 -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; AVX-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vmovaps (%rdi), %ymm2 -; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm0, (%rdi) -; AVX-NEXT: callq use_vec@PLT -; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rbx) -; AVX-NEXT: addq $32, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: popq %rbx -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX2-LABEL: test_masked_store_intervening: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: subq $32, %rsp -; AVX2-NEXT: .cfi_def_cfa_offset 48 -; AVX2-NEXT: .cfi_offset %rbx, -16 -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: callq use_vec@PLT -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, (%rbx) -; AVX2-NEXT: addq $32, %rsp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: .cfi_def_cfa_offset 8 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_masked_store_intervening: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: .cfi_def_cfa_offset 16 -; AVX512-NEXT: subq $144, %rsp -; AVX512-NEXT: .cfi_def_cfa_offset 160 -; AVX512-NEXT: .cfi_offset %rbx, -16 -; AVX512-NEXT: movq %rdi, %rbx -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpmovsxwq %xmm1, %zmm0 -; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %ymm0, (%rdi) -; AVX512-NEXT: callq use_vec@PLT -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, (%rbx) -; AVX512-NEXT: addq $144, %rsp -; AVX512-NEXT: .cfi_def_cfa_offset 16 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: .cfi_def_cfa_offset 8 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %load = load <8 x i32>, ptr %ptr, align 32 - store <8 x i32> zeroinitializer, ptr %ptr, align 32 - %tmp = load <8 x i32>, ptr %ptr - call void @use_vec(<8 x i32> %tmp) - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load - store <8 x i32> %sel, ptr %ptr, align 32 - ret void -} - - -define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { -; AVX-LABEL: foo: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX-NEXT: vpslld $31, %xmm4, %xmm4 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; AVX-NEXT: vpslld $31, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX-NEXT: vpslld $31, %xmm4, %xmm4 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; AVX-NEXT: vpslld $31, %xmm3, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX-NEXT: vmovaps (%rsi), %ymm4 -; AVX-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 -; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; AVX-NEXT: vmovaps %ymm1, (%rsi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX2-LABEL: foo: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 -; AVX2-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: foo: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512-NEXT: vpsllq $63, %zmm2, %zmm2 -; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1 -; AVX512-NEXT: vpmovsxwq %xmm3, %zmm2 -; AVX512-NEXT: vpsllq $63, %zmm2, %zmm2 -; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k2 -; AVX512-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %load = load <8 x i32>, ptr %ptr1, align 32 - %load2 = load <8 x i32>, ptr %ptr2, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load - %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 
x i32> %load2 - store <8 x i32> %sel, ptr %ptr1, align 32 - store <8 x i32> %sel2, ptr %ptr2, align 32 - ret void -} From 34fa965a7c0efe8ed406c2ed5443edcce58707a5 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Thu, 26 Jun 2025 23:12:07 +0530 Subject: [PATCH 6/8] Move to DAGCombiner --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 70 +++++ llvm/lib/CodeGen/TargetLoweringBase.cpp | 6 + .../CodeGen/AArch64/combine-storetomstore.ll | 282 ++++++++++++++++++ .../AArch64/sve-fixed-length-shuffles.ll | 25 +- .../test/CodeGen/ARM/combine-storetomstore.ll | 197 ++++++++++++ .../CodeGen/RISCV/combine-storetomstore.ll | 134 +++++++++ .../CodeGen/RISCV/rvv/sink-splat-operands.ll | 16 +- .../CodeGen/X86/avx512-broadcast-unfold.ll | 196 ++++++------ .../test/CodeGen/X86/combine-storetomstore.ll | 276 +++++++++++++++++ llvm/test/CodeGen/X86/pr30284.ll | 12 +- .../LoopStrengthReduce/X86/macro-fuse-cmp.ll | 1 + 11 files changed, 1070 insertions(+), 145 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/combine-storetomstore.ll create mode 100644 llvm/test/CodeGen/ARM/combine-storetomstore.ll create mode 100644 llvm/test/CodeGen/RISCV/combine-storetomstore.ll create mode 100644 llvm/test/CodeGen/X86/combine-storetomstore.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 08dab7c697b99..62a26c276a9f7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -22451,12 +22452,81 @@ SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) { return SDValue(); } +static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, + const SDLoc &Dl) { + using namespace llvm::SDPatternMatch; + + if (!Store->isSimple() || Store->isTruncatingStore()) + return SDValue(); + + SDValue StoredVal = Store->getValue(); + SDValue StorePtr = Store->getBasePtr(); + SDValue StoreOffset = Store->getOffset(); + EVT VT = Store->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT)) + return SDValue(); + + SDValue Mask, TrueVec, LoadCh; + if (!sd_match(StoredVal, + m_VSelect(m_Value(Mask), m_Value(TrueVec), + m_Load(m_Value(LoadCh), m_Specific(StorePtr), + m_Specific(StoreOffset))))) + return SDValue(); + + LoadSDNode *Load = cast(StoredVal.getOperand(2)); + if (!Load->isSimple()) + return SDValue(); + + auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) { + std::queue Worklist; + + Worklist.push(Store->getChain()); + + while (!Worklist.empty()) { + SDValue Chain = Worklist.front(); + Worklist.pop(); + + SDNode *Node = Chain.getNode(); + if (!Node) + return false; + + if (Node == Load) + return true; + + if (const auto *MemNode = dyn_cast(Node)) + if (!MemNode->isSimple() || MemNode->writeMem()) + return false; + + if (Node->getOpcode() == ISD::TokenFactor) { + for (unsigned i = 0; i < Node->getNumOperands(); ++i) + Worklist.push(Node->getOperand(i)); + } else { + Worklist.push(Node->getOperand(0)); + } + } + + return false; + }; + + if (!IsSafeToFold(Store, Load)) + return SDValue(); + + return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr, + StoreOffset, Mask, VT, Store->getMemOperand(), + Store->getAddressingMode()); +} + SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); + if 
(SDValue MaskedStore = foldToMaskedStore(ST, DAG, SDLoc(N))) + return MaskedStore; + // If this is a store of a bit convert, store the input value if the // resultant store does not need a higher alignment than the original. if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() && diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index cda41a91a372f..04abce6cd6559 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -691,6 +691,12 @@ void TargetLoweringBase::initActions() { setAtomicLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT, MemVT, Expand); + for (MVT VT : MVT::all_valuetypes()) { + if (VT == MVT::Other) + continue; + setOperationAction(ISD::MSTORE, VT, Expand); + } + // We're somewhat special casing MVT::i2 and MVT::i4. Ideally we want to // remove this and targets should individually set these types if not legal. for (ISD::NodeType NT : enum_seq(ISD::DELETED_NODE, ISD::BUILTIN_OP_END, diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll new file mode 100644 index 0000000000000..16c0786a38768 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll @@ -0,0 +1,282 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-- -mattr=+neon | FileCheck %s -check-prefix=AARCH64 +; RUN: llc < %s -mtriple=aarch64-- -mattr=+sve | FileCheck %s -check-prefix=SVE + +define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AARCH64-LABEL: test_masked_store_success: +; AARCH64: // %bb.0: +; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b +; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b +; AARCH64-NEXT: ldp q4, q5, [x0] +; AARCH64-NEXT: ushll v3.4s, v3.4h, #0 +; AARCH64-NEXT: ushll v2.4s, v2.4h, #0 +; AARCH64-NEXT: shl v3.4s, v3.4s, #31 +; AARCH64-NEXT: shl v2.4s, v2.4s, #31 +; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0 +; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0 +; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b +; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b +; AARCH64-NEXT: stp q0, q1, [x0] +; AARCH64-NEXT: ret +; +; SVE-LABEL: test_masked_store_success: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AARCH64-LABEL: test_masked_store_volatile_load: +; AARCH64: // %bb.0: +; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b +; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b +; AARCH64-NEXT: ldr q4, [x0] +; AARCH64-NEXT: ldr q5, [x0, #16] +; AARCH64-NEXT: ushll v3.4s, v3.4h, #0 +; AARCH64-NEXT: ushll v2.4s, v2.4h, #0 +; AARCH64-NEXT: shl v3.4s, v3.4s, #31 +; AARCH64-NEXT: shl v2.4s, v2.4s, #31 +; AARCH64-NEXT: cmlt v3.4s, v3.4s, 
#0 +; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0 +; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b +; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b +; AARCH64-NEXT: stp q0, q1, [x0] +; AARCH64-NEXT: ret +; +; SVE-LABEL: test_masked_store_volatile_load: +; SVE: // %bb.0: +; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; SVE-NEXT: ldr q4, [x0] +; SVE-NEXT: ldr q5, [x0, #16] +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: bif v0.16b, v4.16b, v3.16b +; SVE-NEXT: bif v1.16b, v5.16b, v2.16b +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AARCH64-LABEL: test_masked_store_volatile_store: +; AARCH64: // %bb.0: +; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b +; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b +; AARCH64-NEXT: ldp q4, q5, [x0] +; AARCH64-NEXT: ushll v3.4s, v3.4h, #0 +; AARCH64-NEXT: ushll v2.4s, v2.4h, #0 +; AARCH64-NEXT: shl v3.4s, v3.4s, #31 +; AARCH64-NEXT: shl v2.4s, v2.4s, #31 +; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0 +; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0 +; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b +; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b +; AARCH64-NEXT: str q0, [x0] +; AARCH64-NEXT: str q1, [x0, #16] +; AARCH64-NEXT: ret +; +; SVE-LABEL: test_masked_store_volatile_store: +; SVE: // %bb.0: +; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; SVE-NEXT: ldp q4, q5, [x0] +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: bif v0.16b, v4.16b, v3.16b +; SVE-NEXT: bif v1.16b, v5.16b, v2.16b +; SVE-NEXT: str q0, [x0] +; SVE-NEXT: str q1, [x0, #16] +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AARCH64-LABEL: test_masked_store_intervening: +; AARCH64: // %bb.0: +; AARCH64-NEXT: sub sp, sp, #96 +; AARCH64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; AARCH64-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; AARCH64-NEXT: .cfi_def_cfa_offset 96 +; AARCH64-NEXT: .cfi_offset w19, -8 +; AARCH64-NEXT: .cfi_offset w30, -16 +; AARCH64-NEXT: .cfi_offset b8, -32 +; AARCH64-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill +; AARCH64-NEXT: ldp q1, q3, [x0] +; AARCH64-NEXT: movi v0.2d, #0000000000000000 +; AARCH64-NEXT: fmov d8, d2 +; AARCH64-NEXT: mov x19, x0 +; AARCH64-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill +; AARCH64-NEXT: movi v1.2d, #0000000000000000 +; AARCH64-NEXT: stp q0, q0, [x0] +; AARCH64-NEXT: bl use_vec +; AARCH64-NEXT: zip2 v0.8b, v8.8b, v0.8b +; AARCH64-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload +; AARCH64-NEXT: zip1 v1.8b, v8.8b, v0.8b +; AARCH64-NEXT: ushll v0.4s, v0.4h, #0 +; AARCH64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; AARCH64-NEXT: shl v0.4s, v0.4s, #31 +; AARCH64-NEXT: ushll v1.4s, v1.4h, #0 +; AARCH64-NEXT: cmlt v0.4s, v0.4s, #0 +; AARCH64-NEXT: 
shl v1.4s, v1.4s, #31 +; AARCH64-NEXT: bsl v0.16b, v2.16b, v3.16b +; AARCH64-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; AARCH64-NEXT: ldr q3, [sp] // 16-byte Folded Reload +; AARCH64-NEXT: cmlt v1.4s, v1.4s, #0 +; AARCH64-NEXT: bsl v1.16b, v2.16b, v3.16b +; AARCH64-NEXT: stp q1, q0, [x19] +; AARCH64-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; AARCH64-NEXT: add sp, sp, #96 +; AARCH64-NEXT: ret +; +; SVE-LABEL: test_masked_store_intervening: +; SVE: // %bb.0: +; SVE-NEXT: sub sp, sp, #96 +; SVE-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; SVE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; SVE-NEXT: .cfi_def_cfa_offset 96 +; SVE-NEXT: .cfi_offset w19, -8 +; SVE-NEXT: .cfi_offset w30, -16 +; SVE-NEXT: .cfi_offset b8, -32 +; SVE-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill +; SVE-NEXT: ldp q1, q3, [x0] +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fmov d8, d2 +; SVE-NEXT: mov x19, x0 +; SVE-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill +; SVE-NEXT: movi v1.2d, #0000000000000000 +; SVE-NEXT: stp q0, q0, [x0] +; SVE-NEXT: bl use_vec +; SVE-NEXT: zip2 v0.8b, v8.8b, v0.8b +; SVE-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload +; SVE-NEXT: zip1 v1.8b, v8.8b, v0.8b +; SVE-NEXT: ushll v0.4s, v0.4h, #0 +; SVE-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; SVE-NEXT: shl v0.4s, v0.4s, #31 +; SVE-NEXT: ushll v1.4s, v1.4h, #0 +; SVE-NEXT: cmlt v0.4s, v0.4s, #0 +; SVE-NEXT: shl v1.4s, v1.4s, #31 +; SVE-NEXT: bsl v0.16b, v2.16b, v3.16b +; SVE-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; SVE-NEXT: ldr q3, [sp] // 16-byte Folded Reload +; SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; SVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; SVE-NEXT: stp q1, q0, [x19] +; SVE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; SVE-NEXT: add sp, sp, #96 +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { +; AARCH64-LABEL: test_masked_store_multiple: +; AARCH64: // %bb.0: +; AARCH64-NEXT: zip1 v6.8b, v4.8b, v0.8b +; AARCH64-NEXT: zip2 v4.8b, v4.8b, v0.8b +; AARCH64-NEXT: zip1 v7.8b, v5.8b, v0.8b +; AARCH64-NEXT: zip2 v5.8b, v5.8b, v0.8b +; AARCH64-NEXT: ldp q16, q17, [x0] +; AARCH64-NEXT: ushll v6.4s, v6.4h, #0 +; AARCH64-NEXT: ushll v4.4s, v4.4h, #0 +; AARCH64-NEXT: ushll v7.4s, v7.4h, #0 +; AARCH64-NEXT: ushll v5.4s, v5.4h, #0 +; AARCH64-NEXT: shl v6.4s, v6.4s, #31 +; AARCH64-NEXT: shl v4.4s, v4.4s, #31 +; AARCH64-NEXT: shl v7.4s, v7.4s, #31 +; AARCH64-NEXT: shl v5.4s, v5.4s, #31 +; AARCH64-NEXT: cmlt v6.4s, v6.4s, #0 +; AARCH64-NEXT: cmlt v4.4s, v4.4s, #0 +; AARCH64-NEXT: cmlt v7.4s, v7.4s, #0 +; AARCH64-NEXT: cmlt v5.4s, v5.4s, #0 +; AARCH64-NEXT: bif v0.16b, v16.16b, v6.16b +; AARCH64-NEXT: ldp q6, q16, [x1] +; AARCH64-NEXT: bif v1.16b, v17.16b, v4.16b +; AARCH64-NEXT: bif v2.16b, v6.16b, v7.16b +; AARCH64-NEXT: bif v3.16b, v16.16b, v5.16b +; AARCH64-NEXT: stp q0, q1, [x0] +; AARCH64-NEXT: stp q2, q3, [x1] +; AARCH64-NEXT: ret +; +; SVE-LABEL: test_masked_store_multiple: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b +; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: zip2 v7.8b, v5.8b, v0.8b +; 
SVE-NEXT: zip1 v5.8b, v5.8b, v0.8b +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: ushll v6.4s, v6.4h, #0 +; SVE-NEXT: ushll v4.4s, v4.4h, #0 +; SVE-NEXT: ushll v7.4s, v7.4h, #0 +; SVE-NEXT: ushll v5.4s, v5.4h, #0 +; SVE-NEXT: shl v6.4s, v6.4s, #31 +; SVE-NEXT: shl v4.4s, v4.4s, #31 +; SVE-NEXT: shl v7.4s, v7.4s, #31 +; SVE-NEXT: shl v5.4s, v5.4s, #31 +; SVE-NEXT: cmlt v6.4s, v6.4s, #0 +; SVE-NEXT: cmlt v4.4s, v4.4s, #0 +; SVE-NEXT: cmlt v7.4s, v7.4s, #0 +; SVE-NEXT: cmlt v5.4s, v5.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0 +; SVE-NEXT: ldr q6, [x1] +; SVE-NEXT: cmpne p2.s, p0/z, z4.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z7.s, #0 +; SVE-NEXT: bif v2.16b, v6.16b, v5.16b +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p2, [x0] +; SVE-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; SVE-NEXT: str q2, [x1] +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index d916f26f9b26b..2eff6da0866f8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -60,31 +60,20 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 ; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z2.s, z2.s, #31 -; CHECK-NEXT: and z0.s, z0.s, #0x1 -; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: and z2.s, z2.s, #0x1 -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 +; CHECK-NEXT: st1w { z1.s }, p1, [x0] ; CHECK-NEXT: cmpne p2.s, p0/z, z0.s, #0 -; CHECK-NEXT: ldr z0, [x0, #2, mul vl] -; CHECK-NEXT: and z3.s, z3.s, #0x1 -; CHECK-NEXT: str z1, [x0] +; CHECK-NEXT: asr z3.s, z3.s, #31 ; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: ldr z3, [x0, #3, mul vl] -; CHECK-NEXT: ldr z2, [x0, #1, mul vl] -; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 -; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p0/m, #0 // =0x0 -; CHECK-NEXT: str z0, [x0, #2, mul vl] -; CHECK-NEXT: str z3, [x0, #3, mul vl] -; CHECK-NEXT: str z2, [x0, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p2, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z1.s }, p3, [x0, #3, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/ARM/combine-storetomstore.ll b/llvm/test/CodeGen/ARM/combine-storetomstore.ll new file mode 100644 index 0000000000000..085141b2eabf9 --- /dev/null +++ b/llvm/test/CodeGen/ARM/combine-storetomstore.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s 
-mtriple=armv7-- -mattr=+neon | FileCheck %s -check-prefix=ARM + +define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; ARM-LABEL: test_masked_store_success: +; ARM: @ %bb.0: +; ARM-NEXT: vldr d16, [sp, #24] +; ARM-NEXT: vmov d21, r2, r3 +; ARM-NEXT: ldr r12, [sp, #16] +; ARM-NEXT: vmov d20, r0, r1 +; ARM-NEXT: vzip.8 d16, d17 +; ARM-NEXT: mov r0, sp +; ARM-NEXT: vld1.64 {d22, d23}, [r12:128] +; ARM-NEXT: vmovl.u16 q9, d16 +; ARM-NEXT: vmovl.u16 q8, d17 +; ARM-NEXT: vshl.i32 q9, q9, #31 +; ARM-NEXT: vshl.i32 q8, q8, #31 +; ARM-NEXT: vshr.s32 q9, q9, #31 +; ARM-NEXT: vshr.s32 q8, q8, #31 +; ARM-NEXT: vbsl q9, q10, q11 +; ARM-NEXT: vld1.64 {d20, d21}, [r0] +; ARM-NEXT: vst1.32 {d18, d19}, [r12:128]! +; ARM-NEXT: vld1.64 {d18, d19}, [r12:128] +; ARM-NEXT: vbsl q8, q10, q9 +; ARM-NEXT: vst1.64 {d16, d17}, [r12:128] +; ARM-NEXT: bx lr + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; ARM-LABEL: test_masked_store_volatile_load: +; ARM: @ %bb.0: +; ARM-NEXT: vldr d16, [sp, #24] +; ARM-NEXT: vmov d21, r2, r3 +; ARM-NEXT: ldr r12, [sp, #16] +; ARM-NEXT: vmov d20, r0, r1 +; ARM-NEXT: vzip.8 d16, d17 +; ARM-NEXT: mov r0, sp +; ARM-NEXT: vld1.64 {d22, d23}, [r12:128] +; ARM-NEXT: vmovl.u16 q9, d16 +; ARM-NEXT: vmovl.u16 q8, d17 +; ARM-NEXT: vshl.i32 q9, q9, #31 +; ARM-NEXT: vshl.i32 q8, q8, #31 +; ARM-NEXT: vshr.s32 q9, q9, #31 +; ARM-NEXT: vshr.s32 q8, q8, #31 +; ARM-NEXT: vbsl q9, q10, q11 +; ARM-NEXT: vld1.64 {d20, d21}, [r0] +; ARM-NEXT: vst1.32 {d18, d19}, [r12:128]! +; ARM-NEXT: vld1.64 {d18, d19}, [r12:128] +; ARM-NEXT: vbsl q8, q10, q9 +; ARM-NEXT: vst1.64 {d16, d17}, [r12:128] +; ARM-NEXT: bx lr + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; ARM-LABEL: test_masked_store_volatile_store: +; ARM: @ %bb.0: +; ARM-NEXT: vldr d16, [sp, #24] +; ARM-NEXT: vmov d21, r2, r3 +; ARM-NEXT: ldr r12, [sp, #16] +; ARM-NEXT: vmov d20, r0, r1 +; ARM-NEXT: vzip.8 d16, d17 +; ARM-NEXT: mov r1, sp +; ARM-NEXT: vmovl.u16 q9, d16 +; ARM-NEXT: mov r0, r12 +; ARM-NEXT: vmovl.u16 q8, d17 +; ARM-NEXT: vld1.32 {d22, d23}, [r0:128]! 
+; ARM-NEXT: vld1.64 {d24, d25}, [r0:128] +; ARM-NEXT: vshl.i32 q9, q9, #31 +; ARM-NEXT: vshl.i32 q8, q8, #31 +; ARM-NEXT: vshr.s32 q9, q9, #31 +; ARM-NEXT: vshr.s32 q8, q8, #31 +; ARM-NEXT: vbsl q9, q10, q11 +; ARM-NEXT: vld1.64 {d20, d21}, [r1] +; ARM-NEXT: vbsl q8, q10, q12 +; ARM-NEXT: vst1.64 {d18, d19}, [r12:128] +; ARM-NEXT: vst1.64 {d16, d17}, [r0:128] +; ARM-NEXT: bx lr + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; ARM-LABEL: test_masked_store_intervening: +; ARM: @ %bb.0: +; ARM-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; ARM-NEXT: vpush {d8, d9, d10, d11} +; ARM-NEXT: sub sp, sp, #16 +; ARM-NEXT: ldr r8, [sp, #96] +; ARM-NEXT: vmov.i32 q8, #0x0 +; ARM-NEXT: mov r9, r3 +; ARM-NEXT: mov r5, r2 +; ARM-NEXT: vld1.64 {d8, d9}, [r8:128] +; ARM-NEXT: mov r6, r1 +; ARM-NEXT: mov r4, r8 +; ARM-NEXT: mov r7, r0 +; ARM-NEXT: vst1.32 {d16, d17}, [r4:128]! +; ARM-NEXT: mov r0, #0 +; ARM-NEXT: mov r1, #0 +; ARM-NEXT: mov r2, #0 +; ARM-NEXT: vld1.64 {d10, d11}, [r4:128] +; ARM-NEXT: mov r3, #0 +; ARM-NEXT: vst1.64 {d16, d17}, [r4:128] +; ARM-NEXT: vst1.64 {d16, d17}, [sp] +; ARM-NEXT: bl use_vec +; ARM-NEXT: vldr d16, [sp, #104] +; ARM-NEXT: vmov d21, r5, r9 +; ARM-NEXT: add r0, sp, #80 +; ARM-NEXT: vmov d20, r7, r6 +; ARM-NEXT: vzip.8 d16, d17 +; ARM-NEXT: vld1.64 {d22, d23}, [r0] +; ARM-NEXT: vmovl.u16 q9, d17 +; ARM-NEXT: vmovl.u16 q8, d16 +; ARM-NEXT: vshl.i32 q9, q9, #31 +; ARM-NEXT: vshl.i32 q8, q8, #31 +; ARM-NEXT: vshr.s32 q9, q9, #31 +; ARM-NEXT: vshr.s32 q8, q8, #31 +; ARM-NEXT: vbsl q9, q11, q5 +; ARM-NEXT: vbsl q8, q10, q4 +; ARM-NEXT: vst1.64 {d18, d19}, [r4:128] +; ARM-NEXT: vst1.64 {d16, d17}, [r8:128] +; ARM-NEXT: add sp, sp, #16 +; ARM-NEXT: vpop {d8, d9, d10, d11} +; ARM-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { +; ARM-LABEL: test_masked_store_multiple: +; ARM: @ %bb.0: +; ARM-NEXT: push {r11, lr} +; ARM-NEXT: vldr d16, [sp, #64] +; ARM-NEXT: vmov d23, r2, r3 +; ARM-NEXT: ldr lr, [sp, #60] +; ARM-NEXT: vmov d22, r0, r1 +; ARM-NEXT: vzip.8 d16, d17 +; ARM-NEXT: ldr r12, [sp, #56] +; ARM-NEXT: add r1, sp, #8 +; ARM-NEXT: vmovl.u16 q9, d16 +; ARM-NEXT: vldr d16, [sp, #72] +; ARM-NEXT: mov r0, lr +; ARM-NEXT: vld1.64 {d20, d21}, [r12:128] +; ARM-NEXT: vmovl.u16 q14, d17 +; ARM-NEXT: vld1.32 {d24, d25}, [r0:128]! +; ARM-NEXT: vshl.i32 q9, q9, #31 +; ARM-NEXT: vzip.8 d16, d26 +; ARM-NEXT: vshr.s32 q9, q9, #31 +; ARM-NEXT: vmovl.u16 q8, d16 +; ARM-NEXT: vbsl q9, q11, q10 +; ARM-NEXT: vld1.64 {d22, d23}, [r1] +; ARM-NEXT: vmovl.u16 q10, d26 +; ARM-NEXT: add r1, sp, #24 +; ARM-NEXT: vshl.i32 q13, q14, #31 +; ARM-NEXT: vld1.64 {d28, d29}, [r1] +; ARM-NEXT: vshl.i32 q8, q8, #31 +; ARM-NEXT: add r1, sp, #40 +; ARM-NEXT: vst1.32 {d18, d19}, [r12:128]! 
+; ARM-NEXT: vshl.i32 q10, q10, #31 +; ARM-NEXT: vld1.64 {d18, d19}, [r12:128] +; ARM-NEXT: vshr.s32 q13, q13, #31 +; ARM-NEXT: vshr.s32 q8, q8, #31 +; ARM-NEXT: vld1.64 {d30, d31}, [r0:128] +; ARM-NEXT: vshr.s32 q10, q10, #31 +; ARM-NEXT: vbit q9, q11, q13 +; ARM-NEXT: vld1.64 {d22, d23}, [r1] +; ARM-NEXT: vbsl q8, q14, q12 +; ARM-NEXT: vbsl q10, q11, q15 +; ARM-NEXT: vst1.64 {d18, d19}, [r12:128] +; ARM-NEXT: vst1.64 {d16, d17}, [lr:128] +; ARM-NEXT: vst1.64 {d20, d21}, [r0:128] +; ARM-NEXT: pop {r11, pc} + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/combine-storetomstore.ll b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll new file mode 100644 index 0000000000000..26cad3959e7d3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64-- -mattr=+m,+v,+f | FileCheck %s -check-prefix=RISCV + +define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; RISCV-LABEL: test_masked_store_success: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; RISCV-LABEL: test_masked_store_volatile_load: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vle32.v v10, (a0) +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse32.v v8, (a0) +; RISCV-NEXT: ret + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; RISCV-LABEL: test_masked_store_volatile_store: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vle32.v v10, (a0) +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse32.v v8, (a0) +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; RISCV-LABEL: test_masked_store_intervening: +; RISCV: # %bb.0: +; RISCV-NEXT: addi sp, sp, -32 +; RISCV-NEXT: .cfi_def_cfa_offset 32 +; RISCV-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RISCV-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RISCV-NEXT: .cfi_offset ra, -8 +; RISCV-NEXT: .cfi_offset s0, -16 +; RISCV-NEXT: csrr a1, vlenb +; RISCV-NEXT: slli a2, a1, 2 +; RISCV-NEXT: add a1, a2, a1 +; RISCV-NEXT: sub sp, sp, a1 +; RISCV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x05, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 5 * vlenb +; RISCV-NEXT: csrr a1, vlenb +; RISCV-NEXT: slli a1, a1, 2 +; RISCV-NEXT: add a1, sp, a1 +; RISCV-NEXT: addi a1, a1, 16 +; RISCV-NEXT: vs1r.v v0, (a1) # vscale x 
8-byte Folded Spill +; RISCV-NEXT: mv s0, a0 +; RISCV-NEXT: csrr a1, vlenb +; RISCV-NEXT: slli a1, a1, 1 +; RISCV-NEXT: add a1, sp, a1 +; RISCV-NEXT: addi a1, a1, 16 +; RISCV-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vle32.v v8, (a0) +; RISCV-NEXT: addi a1, sp, 16 +; RISCV-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill +; RISCV-NEXT: vmv.v.i v8, 0 +; RISCV-NEXT: vse32.v v8, (a0) +; RISCV-NEXT: call use_vec +; RISCV-NEXT: csrr a0, vlenb +; RISCV-NEXT: slli a0, a0, 2 +; RISCV-NEXT: add a0, sp, a0 +; RISCV-NEXT: addi a0, a0, 16 +; RISCV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload +; RISCV-NEXT: csrr a0, vlenb +; RISCV-NEXT: slli a0, a0, 1 +; RISCV-NEXT: add a0, sp, a0 +; RISCV-NEXT: addi a0, a0, 16 +; RISCV-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload +; RISCV-NEXT: addi a0, sp, 16 +; RISCV-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse32.v v8, (s0) +; RISCV-NEXT: csrr a0, vlenb +; RISCV-NEXT: slli a1, a0, 2 +; RISCV-NEXT: add a0, a1, a0 +; RISCV-NEXT: add sp, sp, a0 +; RISCV-NEXT: .cfi_def_cfa sp, 32 +; RISCV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RISCV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RISCV-NEXT: .cfi_restore ra +; RISCV-NEXT: .cfi_restore s0 +; RISCV-NEXT: addi sp, sp, 32 +; RISCV-NEXT: .cfi_def_cfa_offset 0 +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { +; RISCV-LABEL: test_masked_store_multiple: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vmv1r.v v13, v0 +; RISCV-NEXT: vle32.v v14, (a1) +; RISCV-NEXT: vmv1r.v v0, v12 +; RISCV-NEXT: vmerge.vvm v10, v14, v10, v0 +; RISCV-NEXT: vmv1r.v v0, v13 +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: vse32.v v10, (a1) +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index c216fb65a6a5b..677152b7c407d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -5427,18 +5427,18 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_select_op1(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_select_op1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 42 +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: li a2, 42 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: .LBB117_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmseq.vx v0, v8, a3 -; CHECK-NEXT: vmerge.vxm v8, v8, a1, v0 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: 
vle32.v v9, (a0) +; CHECK-NEXT: vmseq.vx v0, v9, a2 +; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB117_1 +; CHECK-NEXT: bne a0, a3, .LBB117_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll index ba2cacc087b36..7377311b9e0c9 100644 --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -3161,13 +3161,12 @@ define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB96_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB96_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3195,13 +3194,12 @@ define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB97_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB97_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3230,13 +3228,12 @@ define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB98_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB98_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3265,13 +3262,12 @@ define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB99_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; 
CHECK-NEXT: jne .LBB99_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3299,13 +3295,12 @@ define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB100_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB100_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3334,13 +3329,12 @@ define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB101_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB101_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3369,13 +3363,12 @@ define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB102_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB102_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3403,13 +3396,12 @@ define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB103_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB103_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3438,13 +3430,12 @@ define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB104_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), 
%zmm1 -; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB104_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3473,13 +3464,12 @@ define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB105_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB105_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3507,13 +3497,12 @@ define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB106_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB106_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3542,13 +3531,12 @@ define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB107_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB107_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3577,13 +3565,12 @@ define void @bcast_unfold_pcmp_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB108_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 -; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB108_1 @@ -3612,13 +3599,12 @@ define void @bcast_unfold_pcmp_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] 
+; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB109_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 -; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB109_1 @@ -3648,13 +3634,12 @@ define void @bcast_unfold_pcmp_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB110_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 -; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB110_1 @@ -3684,13 +3669,12 @@ define void @bcast_unfold_pcmp_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB111_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 -; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB111_1 @@ -3719,13 +3703,12 @@ define void @bcast_unfold_pcmp_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB112_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 -; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB112_1 @@ -3755,13 +3738,12 @@ define void @bcast_unfold_pcmp_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB113_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 -; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax 
# imm = 0x3FF ; CHECK-NEXT: jg .LBB113_1 @@ -3791,13 +3773,12 @@ define void @bcast_unfold_pcmpu_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB114_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 -; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB114_1 @@ -3826,13 +3807,12 @@ define void @bcast_unfold_pcmpu_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB115_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 -; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB115_1 @@ -3862,13 +3842,12 @@ define void @bcast_unfold_pcmpu_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB116_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 -; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB116_1 @@ -3898,13 +3877,12 @@ define void @bcast_unfold_pcmpu_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB117_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 -; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB117_1 @@ -3933,13 +3911,12 @@ define void @bcast_unfold_pcmpu_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB118_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 -; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 
{%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB118_1 @@ -3969,13 +3946,12 @@ define void @bcast_unfold_pcmpu_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB119_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 -; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB119_1 @@ -4254,13 +4230,12 @@ define void @bcast_unfold_ptestm_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB127_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vptestmd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB127_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4289,13 +4264,12 @@ define void @bcast_unfold_ptestnm_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB128_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vptestnmd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB128_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4324,13 +4298,12 @@ define void @bcast_unfold_ptestm_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB129_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vptestmq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB129_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4360,13 +4333,12 @@ define void @bcast_unfold_ptestnm_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 
4 ; CHECK-NEXT: .LBB130_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vptestnmq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB130_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll new file mode 100644 index 0000000000000..7b39fa450cbf8 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll @@ -0,0 +1,276 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s -check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s -check-prefix=AVX512 + + +define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_success: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_volatile_load: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_load: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_load: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 
def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_volatile_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_store: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +; AVX-LABEL: test_masked_store_intervening: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: subq $32, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: .cfi_offset %rbx, -16 +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: callq use_vec@PLT +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rbx) +; AVX-NEXT: addq $32, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_intervening: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: .cfi_def_cfa_offset 48 +; AVX2-NEXT: .cfi_offset %rbx, -16 +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: callq use_vec@PLT +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rbx) +; AVX2-NEXT: addq $32, %rsp +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 8 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_intervening: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: subq $144, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 160 +; AVX512-NEXT: .cfi_offset %rbx, -16 +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm0 +; AVX512-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) +; AVX512-NEXT: callq use_vec@PLT +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rbx) +; AVX512-NEXT: addq $144, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { +; AVX-LABEL: test_masked_store_multiple: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vmovaps (%rsi), %ymm4 +; AVX-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmovaps %ymm1, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_multiple: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = 
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_multiple: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vpsllq $63, %zmm2, %zmm2 +; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512-NEXT: vpmovsxwq %xmm3, %zmm2 +; AVX512-NEXT: vpsllq $63, %zmm2, %zmm2 +; AVX512-NEXT: vptestmq %zmm2, %zmm2, %k2 +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} diff --git a/llvm/test/CodeGen/X86/pr30284.ll b/llvm/test/CodeGen/X86/pr30284.ll index f4fb1b3ce72e3..708f0f7ee72da 100644 --- a/llvm/test/CodeGen/X86/pr30284.ll +++ b/llvm/test/CodeGen/X86/pr30284.ll @@ -19,14 +19,12 @@ define void @f_f___un_3C_unf_3E_un_3C_unf_3E_(<16 x i1> %x) { ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 ; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 ; CHECK-NEXT: vpmovd2m %zmm0, %k1 -; CHECK-NEXT: vmovapd 0, %zmm0 -; CHECK-NEXT: vmovapd 64, %zmm1 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vporq 64, %zmm0, %zmm1 +; CHECK-NEXT: vporq 0, %zmm0, %zmm0 ; CHECK-NEXT: kshiftrw $8, %k1, %k2 -; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2} -; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1} -; CHECK-NEXT: vmovapd %zmm0, 0 -; CHECK-NEXT: vmovapd %zmm1, 64 +; CHECK-NEXT: vmovdqa64 %zmm0, 0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, 64 {%k2} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %a_load22 = load <16 x i64>, ptr null, align 1 diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll index ef2ed1f0e87e2..483c2b58e9cc4 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll @@ -90,6 +90,7 @@ define void @maxArray(ptr noalias nocapture %x, ptr noalias nocapture readonly % ; BASE-NEXT: jne .LBB0_1 ; BASE-NEXT: # %bb.2: # %exit ; BASE-NEXT: retq +; ; FUSE-LABEL: maxArray: ; FUSE: # %bb.0: # %entry ; FUSE-NEXT: xorl %eax, %eax From 3106f46b88adc0aa6fd80c4940373580340137b2 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Thu, 26 Jun 2025 23:18:28 +0530 Subject: [PATCH 7/8] Update macro-fuse-cmp.ll --- llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll index 483c2b58e9cc4..ef2ed1f0e87e2 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll +++ 
b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll @@ -90,7 +90,6 @@ define void @maxArray(ptr noalias nocapture %x, ptr noalias nocapture readonly % ; BASE-NEXT: jne .LBB0_1 ; BASE-NEXT: # %bb.2: # %exit ; BASE-NEXT: retq -; ; FUSE-LABEL: maxArray: ; FUSE: # %bb.0: # %entry ; FUSE-NEXT: xorl %eax, %eax From 8c14fba1d1a59472311c6f5931a35a54e3658228 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Fri, 27 Jun 2025 13:51:24 +0530 Subject: [PATCH 8/8] Use allowsMisalignedMemoryAccesses to check if unaligned stores are allowed --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +- .../CodeGen/AArch64/combine-storetomstore.ll | 66 +++++++++++++++---- .../test/CodeGen/ARM/combine-storetomstore.ll | 53 +++++++++++---- .../CodeGen/RISCV/combine-storetomstore.ll | 41 ++++++++---- .../test/CodeGen/X86/combine-storetomstore.ll | 59 +++++++++++++---- 5 files changed, 179 insertions(+), 45 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 62a26c276a9f7..dd4305e4a6819 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22463,9 +22463,12 @@ static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG, SDValue StorePtr = Store->getBasePtr(); SDValue StoreOffset = Store->getOffset(); EVT VT = Store->getMemoryVT(); + unsigned AddrSpace = Store->getAddressSpace(); + Align Alignment = Store->getAlign(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT)) + if (!TLI.isOperationLegalOrCustom(ISD::MSTORE, VT) || + !TLI.allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment)) return SDValue(); SDValue Mask, TrueVec, LoadCh; diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll index 16c0786a38768..7e612a976cead 100644 --- a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll +++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=aarch64-- -mattr=+neon | FileCheck %s -check-prefix=AARCH64 ; RUN: llc < %s -mtriple=aarch64-- -mattr=+sve | FileCheck %s -check-prefix=SVE -define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { ; AARCH64-LABEL: test_masked_store_success: ; AARCH64: // %bb.0: ; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b @@ -39,12 +39,12 @@ define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { ; SVE-NEXT: st1w { z0.s }, p0, [x0] ; SVE-NEXT: ret %load = load <8 x i32>, ptr %ptr, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load store <8 x i32> %sel, ptr %ptr, align 32 ret void } -define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { ; AARCH64-LABEL: test_masked_store_volatile_load: ; AARCH64: // %bb.0: ; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b @@ -79,12 +79,12 @@ define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %c ; SVE-NEXT: stp q0, q1, [x0] ; SVE-NEXT: ret %load = load volatile <8 x i32>, ptr %ptr, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load store <8 x i32> %sel, ptr %ptr, align 32 ret void } -define void 
@test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { ; AARCH64-LABEL: test_masked_store_volatile_store: ; AARCH64: // %bb.0: ; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b @@ -119,14 +119,14 @@ define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> % ; SVE-NEXT: str q1, [x0, #16] ; SVE-NEXT: ret %load = load <8 x i32>, ptr %ptr, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load store volatile <8 x i32> %sel, ptr %ptr, align 32 ret void } declare void @use_vec(<8 x i32>) -define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) { +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { ; AARCH64-LABEL: test_masked_store_intervening: ; AARCH64: // %bb.0: ; AARCH64-NEXT: sub sp, sp, #96 @@ -204,13 +204,13 @@ define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp store <8 x i32> zeroinitializer, ptr %ptr, align 32 %tmp = load <8 x i32>, ptr %ptr call void @use_vec(<8 x i32> %tmp) - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load store <8 x i32> %sel, ptr %ptr, align 32 ret void } -define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) { +define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { ; AARCH64-LABEL: test_masked_store_multiple: ; AARCH64: // %bb.0: ; AARCH64-NEXT: zip1 v6.8b, v4.8b, v0.8b @@ -274,9 +274,53 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p ; SVE-NEXT: ret %load = load <8 x i32>, ptr %ptr1, align 32 %load2 = load <8 x i32>, ptr %ptr2, align 32 - %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load - %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2 store <8 x i32> %sel, ptr %ptr1, align 32 store <8 x i32> %sel2, ptr %ptr2, align 32 ret void } + +define void @test_masked_store_unaligned(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) { +; AARCH64-LABEL: test_masked_store_unaligned: +; AARCH64: // %bb.0: +; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b +; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b +; AARCH64-NEXT: ldp q4, q5, [x0] +; AARCH64-NEXT: ushll v3.4s, v3.4h, #0 +; AARCH64-NEXT: ushll v2.4s, v2.4h, #0 +; AARCH64-NEXT: shl v3.4s, v3.4s, #31 +; AARCH64-NEXT: shl v2.4s, v2.4s, #31 +; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0 +; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0 +; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b +; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b +; AARCH64-NEXT: stp q0, q1, [x0] +; AARCH64-NEXT: ret +; +; SVE-LABEL: test_masked_store_unaligned: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, 
lsl #2]
+; SVE-NEXT:    st1w { z0.s }, p0, [x0]
+; SVE-NEXT:    ret
+  %ptr_i8 = getelementptr i8, ptr %ptr, i32 1
+  %ptr_vec = bitcast ptr %ptr_i8 to ptr
+  %load = load <8 x i32>, ptr %ptr, align 1
+  %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/ARM/combine-storetomstore.ll b/llvm/test/CodeGen/ARM/combine-storetomstore.ll
index 085141b2eabf9..e42f4a578b59c 100644
--- a/llvm/test/CodeGen/ARM/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/ARM/combine-storetomstore.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=armv7-- -mattr=+neon | FileCheck %s -check-prefix=ARM
 
-define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; ARM-LABEL: test_masked_store_success:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    vldr d16, [sp, #24]
@@ -25,12 +25,12 @@ define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
 ; ARM-NEXT:    vst1.64 {d16, d17}, [r12:128]
 ; ARM-NEXT:    bx lr
   %load = load <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; ARM-LABEL: test_masked_store_volatile_load:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    vldr d16, [sp, #24]
@@ -54,12 +54,12 @@ define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %c
 ; ARM-NEXT:    vst1.64 {d16, d17}, [r12:128]
 ; ARM-NEXT:    bx lr
   %load = load volatile <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; ARM-LABEL: test_masked_store_volatile_store:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    vldr d16, [sp, #24]
@@ -84,14 +84,14 @@ define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %
 ; ARM-NEXT:    vst1.64 {d16, d17}, [r0:128]
 ; ARM-NEXT:    bx lr
   %load = load <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store volatile <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
 declare void @use_vec(<8 x i32>)
 
-define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; ARM-LABEL: test_masked_store_intervening:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    push {r4, r5, r6, r7, r8, r9, r11, lr}
@@ -137,13 +137,13 @@ define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp
   store <8 x i32> zeroinitializer, ptr %ptr, align 32
   %tmp = load <8 x i32>, ptr %ptr
   call void @use_vec(<8 x i32> %tmp)
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) {
+define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) {
 ; ARM-LABEL: test_masked_store_multiple:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    push {r11, lr}
@@ -189,9 +189,40 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p
 ; ARM-NEXT:    pop {r11, pc}
   %load = load <8 x i32>, ptr %ptr1, align 32
   %load2 = load <8 x i32>, ptr %ptr2, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
-  %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+  %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2
   store <8 x i32> %sel, ptr %ptr1, align 32
   store <8 x i32> %sel2, ptr %ptr2, align 32
   ret void
 }
+
+define void @test_masked_store_unaligned(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) {
+; ARM-LABEL: test_masked_store_unaligned:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    vldr d16, [sp, #24]
+; ARM-NEXT:    vmov d21, r2, r3
+; ARM-NEXT:    ldr r12, [sp, #16]
+; ARM-NEXT:    vmov d20, r0, r1
+; ARM-NEXT:    vzip.8 d16, d17
+; ARM-NEXT:    mov r0, sp
+; ARM-NEXT:    vld1.8 {d22, d23}, [r12]
+; ARM-NEXT:    vmovl.u16 q9, d16
+; ARM-NEXT:    vmovl.u16 q8, d17
+; ARM-NEXT:    vshl.i32 q9, q9, #31
+; ARM-NEXT:    vshl.i32 q8, q8, #31
+; ARM-NEXT:    vshr.s32 q9, q9, #31
+; ARM-NEXT:    vshr.s32 q8, q8, #31
+; ARM-NEXT:    vbsl q9, q10, q11
+; ARM-NEXT:    vld1.64 {d20, d21}, [r0]
+; ARM-NEXT:    vst1.8 {d18, d19}, [r12]!
+; ARM-NEXT:    vld1.8 {d18, d19}, [r12]
+; ARM-NEXT:    vbsl q8, q10, q9
+; ARM-NEXT:    vst1.8 {d16, d17}, [r12]
+; ARM-NEXT:    bx lr
+  %ptr_i8 = getelementptr i8, ptr %ptr, i32 1
+  %ptr_vec = bitcast ptr %ptr_i8 to ptr
+  %load = load <8 x i32>, ptr %ptr, align 1
+  %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/combine-storetomstore.ll b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll
index 26cad3959e7d3..099513a832311 100644
--- a/llvm/test/CodeGen/RISCV/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll
@@ -1,19 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=riscv64-- -mattr=+m,+v,+f | FileCheck %s -check-prefix=RISCV
 
-define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; RISCV-LABEL: test_masked_store_success:
 ; RISCV:       # %bb.0:
 ; RISCV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RISCV-NEXT:    vse32.v v8, (a0), v0.t
 ; RISCV-NEXT:    ret
   %load = load <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; RISCV-LABEL: test_masked_store_volatile_load:
 ; RISCV:       # %bb.0:
 ; RISCV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
@@ -22,12 +22,12 @@ define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %c
 ; RISCV-NEXT:    vse32.v v8, (a0)
 ; RISCV-NEXT:    ret
   %load = load volatile <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; RISCV-LABEL: test_masked_store_volatile_store:
 ; RISCV:       # %bb.0:
 ; RISCV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
@@ -36,14 +36,14 @@ define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %
 ; RISCV-NEXT:    vse32.v v8, (a0)
 ; RISCV-NEXT:    ret
   %load = load <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store volatile <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
 declare void @use_vec(<8 x i32>)
 
-define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; RISCV-LABEL: test_masked_store_intervening:
 ; RISCV:       # %bb.0:
 ; RISCV-NEXT:    addi sp, sp, -32
@@ -106,13 +106,13 @@ define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp
   store <8 x i32> zeroinitializer, ptr %ptr, align 32
   %tmp = load <8 x i32>, ptr %ptr
   call void @use_vec(<8 x i32> %tmp)
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) {
+define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) {
 ; RISCV-LABEL: test_masked_store_multiple:
 ; RISCV:       # %bb.0:
 ; RISCV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
@@ -126,9 +126,28 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p
 ; RISCV-NEXT:    ret
   %load = load <8 x i32>, ptr %ptr1, align 32
   %load2 = load <8 x i32>, ptr %ptr2, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
-  %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+  %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2
   store <8 x i32> %sel, ptr %ptr1, align 32
   store <8 x i32> %sel2, ptr %ptr2, align 32
   ret void
 }
+
+define void @test_masked_store_unaligned(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_unaligned:
+; RISCV:       # %bb.0:
+; RISCV-NEXT:    li a1, 32
+; RISCV-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT:    vle8.v v10, (a0)
+; RISCV-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT:    vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT:    vse8.v v8, (a0)
+; RISCV-NEXT:    ret
+  %ptr_i8 = getelementptr i8, ptr %ptr, i32 1
+  %ptr_vec = bitcast ptr %ptr_i8 to ptr
+  %load = load <8 x i32>, ptr %ptr, align 1
+  %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 1
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll
index 7b39fa450cbf8..a35e1f3aad5d3 100644
--- a/llvm/test/CodeGen/X86/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s -check-prefix=AVX512
 
 
-define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; AVX-LABEL: test_masked_store_success:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -34,12 +34,12 @@ define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %load = load <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; AVX-LABEL: test_masked_store_volatile_load:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -75,12 +75,12 @@ define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %c
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %load = load volatile <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; AVX-LABEL: test_masked_store_volatile_store:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -116,14 +116,14 @@ define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %load = load <8 x i32>, ptr %ptr, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store volatile <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
 declare void @use_vec(<8 x i32>)
 
-define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
+define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
 ; AVX-LABEL: test_masked_store_intervening:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    pushq %rbx
@@ -211,13 +211,13 @@ define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp
   store <8 x i32> zeroinitializer, ptr %ptr, align 32
   %tmp = load <8 x i32>, ptr %ptr
   call void @use_vec(<8 x i32> %tmp)
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
   store <8 x i32> %sel, ptr %ptr, align 32
   ret void
 }
 
-define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) {
+define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) {
 ; AVX-LABEL: test_masked_store_multiple:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -268,9 +268,46 @@ define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, p
 ; AVX512-NEXT:    retq
   %load = load <8 x i32>, ptr %ptr1, align 32
   %load2 = load <8 x i32>, ptr %ptr2, align 32
-  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
-  %sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2
+  %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+  %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2
   store <8 x i32> %sel, ptr %ptr1, align 32
   store <8 x i32> %sel2, ptr %ptr2, align 32
   ret void
 }
+
+define void @test_masked_store_unaligned(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) {
+; AVX-LABEL: test_masked_store_unaligned:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_masked_store_unaligned:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpmaskmovd %ymm0, %ymm1, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_masked_store_unaligned:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
+; AVX512-NEXT:    vpsllq $63, %zmm1, %zmm1
+; AVX512-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %ptr_i8 = getelementptr i8, ptr %ptr, i32 1
+  %ptr_vec = bitcast ptr %ptr_i8 to ptr
+  %load = load <8 x i32>, ptr %ptr, align 1
+  %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load
+  store <8 x i32> %sel, ptr %ptr, align 1
+  ret void
+}