diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52cb1dbb33b86..475047cec1317 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -129,6 +129,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -2910,6 +2911,111 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }
 
+/// Try to fold a halving unsigned-min reduction that is spelled out as
+/// alternating shuffles and llvm.umin calls into a single
+/// llvm.vector.reduce.umin call:
+///
+///   %s1 = shufflevector <N x T> %src, poison, <N/2, ..., N-1, poison, ...>
+///   %m1 = umin(%src, %s1)
+///   %s2 = shufflevector <N x T> %m1, poison, <N/4, ..., N/2-1, poison, ...>
+///   %m2 = umin(%m1, %s2)
+///   ...
+///   %r  = extractelement %mK, 0
+///     -->
+///   %r  = llvm.vector.reduce.umin(%src)
+///
+/// \p I is the candidate first shuffle. The chain is matched positionally:
+/// each link must directly follow the previous one in the basic block.
+/// \returns true if the final extractelement was replaced by the reduction.
+///
+/// NOTE(review): there is no cost-model comparison and no use-count check on
+/// the intermediate values; the fold fires whenever the pattern matches.
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  auto *FirstShuffle = dyn_cast<ShuffleVectorInst>(&I);
+  if (!FirstShuffle)
+    return false;
+
+  // The value being reduced is the input of the first shuffle.
+  Value *Src = FirstShuffle->getOperand(0);
+  auto *SrcTy = dyn_cast<FixedVectorType>(Src->getType());
+  if (!SrcTy)
+    return false;
+
+  // Repeated halving only covers every lane when the lane count is a power
+  // of two.
+  const unsigned NumElts = SrcTy->getNumElements();
+  if (NumElts < 2 || (NumElts & (NumElts - 1)) != 0)
+    return false;
+
+  // Partial result so far; its lane 0 ends up holding the full reduction.
+  Value *Acc = Src;
+  // Shuffle that was matched but not yet merged into Acc by a umin.
+  ShuffleVectorInst *PendingShuffle = nullptr;
+  // Number of lanes the next shuffle must move down; 0 once fully reduced.
+  unsigned HalfWidth = NumElts / 2;
+  ExtractElementInst *Extract = nullptr;
+
+  for (Instruction *Cur = &I; Cur; Cur = Cur->getNextNode()) {
+    if (PendingShuffle) {
+      // Expect umin(Acc, PendingShuffle), operands in either order.
+      auto *II = dyn_cast<IntrinsicInst>(Cur);
+      if (!II || II->getIntrinsicID() != Intrinsic::umin)
+        return false;
+      Value *LHS = II->getArgOperand(0);
+      Value *RHS = II->getArgOperand(1);
+      if (!(LHS == Acc && RHS == PendingShuffle) &&
+          !(LHS == PendingShuffle && RHS == Acc))
+        return false;
+      Acc = II;
+      PendingShuffle = nullptr;
+      continue;
+    }
+
+    if (HalfWidth == 0) {
+      // Every lane is folded in: the chain must end by extracting lane 0.
+      auto *EE = dyn_cast<ExtractElementInst>(Cur);
+      if (!EE || EE->getVectorOperand() != Acc)
+        return false;
+      auto *Idx = dyn_cast<ConstantInt>(EE->getIndexOperand());
+      if (!Idx || !Idx->isZero())
+        return false;
+      Extract = EE;
+      break;
+    }
+
+    // Expect a shuffle of Acc that moves lanes [HalfWidth, 2 * HalfWidth)
+    // down to lanes [0, HalfWidth) and leaves every other lane poison.
+    auto *SV = dyn_cast<ShuffleVectorInst>(Cur);
+    if (!SV || SV->getOperand(0) != Acc ||
+        !isa<PoisonValue>(SV->getOperand(1)))
+      return false;
+    ArrayRef<int> Mask = SV->getShuffleMask();
+    if (Mask.size() != NumElts)
+      return false;
+    for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
+      const int Expected = Lane < HalfWidth
+                               ? static_cast<int>(HalfWidth + Lane)
+                               : PoisonMaskElem;
+      if (Mask[Lane] != Expected)
+        return false;
+    }
+    PendingShuffle = SV;
+    HalfWidth /= 2;
+  }
+
+  // Defensive: the block ended before the final extractelement was reached.
+  if (!Extract)
+    return false;
+
+  // Emit the reduction at the extract; Src feeds the first shuffle, which
+  // precedes the extract in this block, so it is available here.
+  Builder.SetInsertPoint(Extract);
+  Value *Reduced = Builder.CreateIntrinsic(Intrinsic::vector_reduce_umin,
+                                           {SrcTy}, {Src});
+  replaceValue(*Extract, *Reduced);
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 /// reduce(trunc(x)) -> trunc(reduce(x)).
 /// reduce(sext(x)) -> sext(reduce(x)).
@@ -3607,6 +3735,7 @@ bool VectorCombine::run() {
         MadeChange |= foldShuffleOfIntrinsics(I);
         MadeChange |= foldSelectShuffle(I);
         MadeChange |= foldShuffleToIdentity(I);
+        MadeChange |= foldShuffleChainsToReduce(I);
         break;
       case Instruction::BitCast:
         MadeChange |= foldBitcastShuffle(I);
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..6f21eb5097fde
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}