[RISCV] Use slideup to lower build_vector when its last operand is a reduction #154450
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Min-Yih Hsu (mshockwave)

Changes

The general lowering of build_vector starts by splatting the first operand and then sliding the other operands down one by one. However, the initial splat can be avoided if the last operand is extracted from a reduction result: in that case we can use the original vector reduction result as the start value and slide the other operands up (in reverse order) one by one instead.

Original context: #154175 (comment)

Full diff: https://github.com/llvm/llvm-project/pull/154450.diff

4 Files Affected:
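To make the element ordering concrete, here is a minimal standalone C++ sketch (not LLVM code) that models the vslide1down.vx / vslide1up.vx semantics on plain vectors and checks that both strategies build the same vector. The helper names and the element values are purely illustrative.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Toy model of vslide1down.vx: every element moves one lane toward index 0
// and the scalar lands in the highest lane.
static std::vector<int> slide1down(std::vector<int> V, int Scalar) {
  std::rotate(V.begin(), V.begin() + 1, V.end());
  V.back() = Scalar;
  return V;
}

// Toy model of vslide1up.vx: every element moves one lane toward the highest
// index and the scalar lands in lane 0.
static std::vector<int> slide1up(std::vector<int> V, int Scalar) {
  std::rotate(V.rbegin(), V.rbegin() + 1, V.rend());
  V.front() = Scalar;
  return V;
}

int main() {
  // Desired build_vector result; element 3 is (conceptually) a reduction
  // result that already sits in lane 0 of some vector register.
  const std::vector<int> Want = {10, 20, 30, 40};

  // Current lowering: splat the first element, then slide the rest down.
  std::vector<int> Down(4, Want[0]); // splat
  for (int i = 1; i < 4; ++i)
    Down = slide1down(Down, Want[i]);
  assert(Down == Want);

  // New lowering: seed with the vector holding the reduction result (40) in
  // lane 0, then slide the remaining elements up in reverse order.
  std::vector<int> Up = {40, 0, 0, 0};
  for (int i = 2; i >= 0; --i)
    Up = slide1up(Up, Want[i]);
  assert(Up == Want);
  return 0;
}
```

The reversal is what lets the reduction result, which already occupies lane 0 of a vector register, serve as the start value directly, so no initial splat is needed.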
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4a1db80076530..ce6fc8425856a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4512,33 +4512,94 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
"Illegal type which will result in reserved encoding");
const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
+ auto getVSlide = [&](bool SlideUp, EVT ContainerVT, SDValue Passthru,
+ SDValue Vec, SDValue Offset, SDValue Mask,
+ SDValue VL) -> SDValue {
+ if (SlideUp)
+ return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+ Mask, VL, Policy);
+ return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+ Mask, VL, Policy);
+ };
+
+ // General case: splat the first operand and slide the other operands down
+ // one by one to form a vector. Alternatively, if the last operand is an
+ // extraction from a reduction result, we can use the original vector
+ // reduction result as the start value and slide up instead of down, which
+ // avoids the splat.
+ SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+ SDValue Reduce;
+ bool SlideUp = false;
+ // Find the first non-undef operand from the tail.
+ auto ItLastNonUndef = find_if(Operands.rbegin(), Operands.rend(),
+ [](SDValue V) { return !V.isUndef(); });
+ if (ItLastNonUndef != Operands.rend()) {
+ using namespace SDPatternMatch;
+ // Check if the last non-undef operand was extracted from a reduction.
+ for (unsigned Opc :
+ {RISCVISD::VECREDUCE_ADD_VL, RISCVISD::VECREDUCE_UMAX_VL,
+ RISCVISD::VECREDUCE_SMAX_VL, RISCVISD::VECREDUCE_UMIN_VL,
+ RISCVISD::VECREDUCE_SMIN_VL, RISCVISD::VECREDUCE_AND_VL,
+ RISCVISD::VECREDUCE_OR_VL, RISCVISD::VECREDUCE_XOR_VL,
+ RISCVISD::VECREDUCE_FADD_VL, RISCVISD::VECREDUCE_SEQ_FADD_VL,
+ RISCVISD::VECREDUCE_FMAX_VL, RISCVISD::VECREDUCE_FMIN_VL}) {
+ SlideUp = sd_match(
+ *ItLastNonUndef,
+ m_ExtractElt(m_AllOf(m_Opc(Opc), m_Value(Reduce)), m_Zero()));
+ if (SlideUp)
+ break;
+ }
+ }
+
+ if (SlideUp) {
+ // Adapt Reduce's type into ContainerVT.
+ if (Reduce.getValueType().getVectorMinNumElements() <
+ ContainerVT.getVectorMinNumElements())
+ Reduce = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), Reduce, 0);
+ else
+ Reduce = DAG.getExtractSubvector(DL, ContainerVT, Reduce, 0);
+
+ // Reverse the elements as we're going to slide up from the last element.
+ for (unsigned i = 0U, N = Operands.size(), H = divideCeil(N, 2); i < H; ++i)
+ std::swap(Operands[i], Operands[N - 1 - i]);
+ }
SDValue Vec;
UndefCount = 0;
- for (SDValue V : Op->ops()) {
+ for (SDValue V : Operands) {
if (V.isUndef()) {
UndefCount++;
continue;
}
- // Start our sequence with a TA splat in the hopes that hardware is able to
- // recognize there's no dependency on the prior value of our temporary
- // register.
+ // Start our sequence with either a TA splat or a reduction result in the
+ // hopes that hardware is able to recognize there's no dependency on the
+ // prior value of our temporary register.
if (!Vec) {
- Vec = DAG.getSplatVector(VT, DL, V);
- Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ if (SlideUp) {
+ Vec = Reduce;
+ } else {
+ Vec = DAG.getSplatVector(VT, DL, V);
+ Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+ }
+
UndefCount = 0;
continue;
}
if (UndefCount) {
const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
- Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
- Vec, Offset, Mask, VL, Policy);
+ Vec = getVSlide(SlideUp, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+ Offset, Mask, VL);
UndefCount = 0;
}
- auto OpCode =
- VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+
+ unsigned OpCode;
+ if (VT.isFloatingPoint())
+ OpCode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
+ else
+ OpCode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;
+
if (!VT.isFloatingPoint())
V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
@@ -4546,8 +4607,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
if (UndefCount) {
const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
- Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
- Vec, Offset, Mask, VL, Policy);
+ Vec = getVSlide(SlideUp, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+ Offset, Mask, VL);
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
}
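A note on the undef handling in the loop above: consecutive undef operands are not slid one at a time; they are folded into a single slide by UndefCount once the next defined operand is reached. Below is a minimal standalone model of the slide-up case (again, not LLVM code; the lane values and helper names are made up for illustration).

```cpp
#include <cassert>
#include <optional>
#include <vector>

// Illustrative lane model: std::nullopt stands for an undef lane.
using Lane = std::optional<int>;
using Vec = std::vector<Lane>;

// Models vslideup by N lanes with an undef passthru: lanes [0, N) become
// undef, lane i (i >= N) receives V[i - N].
static Vec slideupN(const Vec &V, unsigned N) {
  Vec R(V.size(), std::nullopt);
  for (unsigned i = N; i < V.size(); ++i)
    R[i] = V[i - N];
  return R;
}

// Models vslide1up.vx: every lane moves one slot up, the scalar lands in lane 0.
static Vec slide1up(const Vec &V, int Scalar) {
  Vec R = slideupN(V, 1);
  R[0] = Scalar;
  return R;
}

int main() {
  // Build {a, undef, undef, r}, where r is the reduction result already in
  // lane 0 of some register. Operands are processed in reverse:
  // r, undef, undef, a.
  const int a = 7, r = 42;
  Vec Acc = {r, std::nullopt, std::nullopt, std::nullopt}; // reduction vector

  // The two consecutive undef operands are folded into one slide by
  // UndefCount = 2 once the next defined operand (a) is reached.
  Acc = slideupN(Acc, 2); // {undef, undef, r, undef}
  Acc = slide1up(Acc, a); // {a, undef, undef, r}

  assert(Acc[0] == a && Acc[3] == r && !Acc[1] && !Acc[2]);
  return 0;
}
```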
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 3c3e08d387faa..a806e758d2758 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1828,3 +1828,59 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
%v7 = insertelement <8 x double> %v6, double %e7, i64 7
ret <8 x double> %v7
}
+
+define <4 x float> @buildvec_vfredusum(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v16, fa0
+; CHECK-NEXT: vfredusum.vs v8, v8, v16
+; CHECK-NEXT: vfredusum.vs v9, v10, v16
+; CHECK-NEXT: vfredusum.vs v10, v12, v16
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: vfmv.f.s fa4, v9
+; CHECK-NEXT: vfmv.f.s fa3, v10
+; CHECK-NEXT: vfredusum.vs v8, v14, v16
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT: vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT: ret
+ %247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+ %248 = insertelement <4 x float> poison, float %247, i64 0
+ %250 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+ %251 = insertelement <4 x float> %248, float %250, i64 1
+ %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+ %253 = insertelement <4 x float> %251, float %252, i64 2
+ %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+ %255 = insertelement <4 x float> %253, float %254, i64 3
+ ret <4 x float> %255
+}
+
+define <4 x float> @buildvec_vfredosum(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredosum:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v16, fa0
+; CHECK-NEXT: vfredosum.vs v8, v8, v16
+; CHECK-NEXT: vfredosum.vs v9, v10, v16
+; CHECK-NEXT: vfredosum.vs v10, v12, v16
+; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: vfmv.f.s fa4, v9
+; CHECK-NEXT: vfmv.f.s fa3, v10
+; CHECK-NEXT: vfredosum.vs v8, v14, v16
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT: vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT: ret
+ %247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+ %248 = insertelement <4 x float> poison, float %247, i64 0
+ %250 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+ %251 = insertelement <4 x float> %248, float %250, i64 1
+ %252 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+ %253 = insertelement <4 x float> %251, float %252, i64 2
+ %254 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+ %255 = insertelement <4 x float> %253, float %254, i64 3
+ ret <4 x float> %255
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index d9bb007a10f71..a02117fdd2833 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -3416,5 +3416,204 @@ define <4 x i1> @buildvec_i1_splat(i1 %e1) {
ret <4 x i1> %v4
}
+define <4 x i32> @buildvec_vredsum(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %arg2, <8 x i32> %arg3) nounwind {
+; RV32-LABEL: buildvec_vredsum:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vmv.s.x v16, zero
+; RV32-NEXT: vredsum.vs v8, v8, v16
+; RV32-NEXT: vredsum.vs v9, v10, v16
+; RV32-NEXT: vredsum.vs v10, v12, v16
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vredsum.vs v8, v14, v16
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vslide1up.vx v9, v8, a2
+; RV32-NEXT: vslide1up.vx v10, v9, a1
+; RV32-NEXT: vslide1up.vx v8, v10, a0
+; RV32-NEXT: ret
+;
+; RV64V-ONLY-LABEL: buildvec_vredsum:
+; RV64V-ONLY: # %bb.0:
+; RV64V-ONLY-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-ONLY-NEXT: vmv.s.x v16, zero
+; RV64V-ONLY-NEXT: vredsum.vs v8, v8, v16
+; RV64V-ONLY-NEXT: vredsum.vs v9, v10, v16
+; RV64V-ONLY-NEXT: vredsum.vs v10, v12, v16
+; RV64V-ONLY-NEXT: vmv.x.s a0, v8
+; RV64V-ONLY-NEXT: vmv.x.s a1, v9
+; RV64V-ONLY-NEXT: vmv.x.s a2, v10
+; RV64V-ONLY-NEXT: vredsum.vs v8, v14, v16
+; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64V-ONLY-NEXT: vslide1up.vx v9, v8, a2
+; RV64V-ONLY-NEXT: vslide1up.vx v10, v9, a1
+; RV64V-ONLY-NEXT: vslide1up.vx v8, v10, a0
+; RV64V-ONLY-NEXT: ret
+;
+; RVA22U64-LABEL: buildvec_vredsum:
+; RVA22U64: # %bb.0:
+; RVA22U64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RVA22U64-NEXT: vmv.s.x v16, zero
+; RVA22U64-NEXT: vredsum.vs v8, v8, v16
+; RVA22U64-NEXT: vredsum.vs v9, v10, v16
+; RVA22U64-NEXT: vredsum.vs v10, v12, v16
+; RVA22U64-NEXT: vredsum.vs v11, v14, v16
+; RVA22U64-NEXT: vmv.x.s a0, v8
+; RVA22U64-NEXT: vmv.x.s a1, v9
+; RVA22U64-NEXT: vmv.x.s a2, v10
+; RVA22U64-NEXT: slli a1, a1, 32
+; RVA22U64-NEXT: add.uw a0, a0, a1
+; RVA22U64-NEXT: vmv.x.s a1, v11
+; RVA22U64-NEXT: slli a1, a1, 32
+; RVA22U64-NEXT: add.uw a1, a2, a1
+; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RVA22U64-NEXT: vmv.v.x v8, a0
+; RVA22U64-NEXT: vslide1down.vx v8, v8, a1
+; RVA22U64-NEXT: ret
+;
+; RVA22U64-PACK-LABEL: buildvec_vredsum:
+; RVA22U64-PACK: # %bb.0:
+; RVA22U64-PACK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RVA22U64-PACK-NEXT: vmv.s.x v16, zero
+; RVA22U64-PACK-NEXT: vredsum.vs v8, v8, v16
+; RVA22U64-PACK-NEXT: vredsum.vs v9, v10, v16
+; RVA22U64-PACK-NEXT: vredsum.vs v10, v12, v16
+; RVA22U64-PACK-NEXT: vredsum.vs v11, v14, v16
+; RVA22U64-PACK-NEXT: vmv.x.s a0, v8
+; RVA22U64-PACK-NEXT: vmv.x.s a1, v9
+; RVA22U64-PACK-NEXT: vmv.x.s a2, v10
+; RVA22U64-PACK-NEXT: pack a0, a0, a1
+; RVA22U64-PACK-NEXT: vmv.x.s a1, v11
+; RVA22U64-PACK-NEXT: pack a1, a2, a1
+; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RVA22U64-PACK-NEXT: vmv.v.x v8, a0
+; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a1
+; RVA22U64-PACK-NEXT: ret
+;
+; RV64ZVE32-LABEL: buildvec_vredsum:
+; RV64ZVE32: # %bb.0:
+; RV64ZVE32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64ZVE32-NEXT: vmv.s.x v16, zero
+; RV64ZVE32-NEXT: vredsum.vs v8, v8, v16
+; RV64ZVE32-NEXT: vredsum.vs v9, v10, v16
+; RV64ZVE32-NEXT: vredsum.vs v10, v12, v16
+; RV64ZVE32-NEXT: vmv.x.s a0, v8
+; RV64ZVE32-NEXT: vmv.x.s a1, v9
+; RV64ZVE32-NEXT: vmv.x.s a2, v10
+; RV64ZVE32-NEXT: vredsum.vs v8, v14, v16
+; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32-NEXT: vslide1up.vx v9, v8, a2
+; RV64ZVE32-NEXT: vslide1up.vx v10, v9, a1
+; RV64ZVE32-NEXT: vslide1up.vx v8, v10, a0
+; RV64ZVE32-NEXT: ret
+ %247 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg0)
+ %248 = insertelement <4 x i32> poison, i32 %247, i64 0
+ %250 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg1)
+ %251 = insertelement <4 x i32> %248, i32 %250, i64 1
+ %252 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg2)
+ %253 = insertelement <4 x i32> %251, i32 %252, i64 2
+ %254 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg3)
+ %255 = insertelement <4 x i32> %253, i32 %254, i64 3
+ ret <4 x i32> %255
+}
+
+define <4 x i32> @buildvec_vredmax(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %arg2, <8 x i32> %arg3) nounwind {
+; RV32-LABEL: buildvec_vredmax:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vredmaxu.vs v8, v8, v8
+; RV32-NEXT: vredmaxu.vs v9, v10, v10
+; RV32-NEXT: vredmaxu.vs v10, v12, v12
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vmv.x.s a2, v10
+; RV32-NEXT: vredmaxu.vs v8, v14, v14
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vslide1up.vx v9, v8, a2
+; RV32-NEXT: vslide1up.vx v10, v9, a1
+; RV32-NEXT: vslide1up.vx v8, v10, a0
+; RV32-NEXT: ret
+;
+; RV64V-ONLY-LABEL: buildvec_vredmax:
+; RV64V-ONLY: # %bb.0:
+; RV64V-ONLY-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-ONLY-NEXT: vredmaxu.vs v8, v8, v8
+; RV64V-ONLY-NEXT: vredmaxu.vs v9, v10, v10
+; RV64V-ONLY-NEXT: vredmaxu.vs v10, v12, v12
+; RV64V-ONLY-NEXT: vmv.x.s a0, v8
+; RV64V-ONLY-NEXT: vmv.x.s a1, v9
+; RV64V-ONLY-NEXT: vmv.x.s a2, v10
+; RV64V-ONLY-NEXT: vredmaxu.vs v8, v14, v14
+; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64V-ONLY-NEXT: vslide1up.vx v9, v8, a2
+; RV64V-ONLY-NEXT: vslide1up.vx v10, v9, a1
+; RV64V-ONLY-NEXT: vslide1up.vx v8, v10, a0
+; RV64V-ONLY-NEXT: ret
+;
+; RVA22U64-LABEL: buildvec_vredmax:
+; RVA22U64: # %bb.0:
+; RVA22U64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RVA22U64-NEXT: vredmaxu.vs v8, v8, v8
+; RVA22U64-NEXT: vredmaxu.vs v9, v10, v10
+; RVA22U64-NEXT: vredmaxu.vs v10, v12, v12
+; RVA22U64-NEXT: vredmaxu.vs v11, v14, v14
+; RVA22U64-NEXT: vmv.x.s a0, v8
+; RVA22U64-NEXT: vmv.x.s a1, v9
+; RVA22U64-NEXT: vmv.x.s a2, v10
+; RVA22U64-NEXT: slli a1, a1, 32
+; RVA22U64-NEXT: add.uw a0, a0, a1
+; RVA22U64-NEXT: vmv.x.s a1, v11
+; RVA22U64-NEXT: slli a1, a1, 32
+; RVA22U64-NEXT: add.uw a1, a2, a1
+; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RVA22U64-NEXT: vmv.v.x v8, a0
+; RVA22U64-NEXT: vslide1down.vx v8, v8, a1
+; RVA22U64-NEXT: ret
+;
+; RVA22U64-PACK-LABEL: buildvec_vredmax:
+; RVA22U64-PACK: # %bb.0:
+; RVA22U64-PACK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RVA22U64-PACK-NEXT: vredmaxu.vs v8, v8, v8
+; RVA22U64-PACK-NEXT: vredmaxu.vs v9, v10, v10
+; RVA22U64-PACK-NEXT: vredmaxu.vs v10, v12, v12
+; RVA22U64-PACK-NEXT: vredmaxu.vs v11, v14, v14
+; RVA22U64-PACK-NEXT: vmv.x.s a0, v8
+; RVA22U64-PACK-NEXT: vmv.x.s a1, v9
+; RVA22U64-PACK-NEXT: vmv.x.s a2, v10
+; RVA22U64-PACK-NEXT: pack a0, a0, a1
+; RVA22U64-PACK-NEXT: vmv.x.s a1, v11
+; RVA22U64-PACK-NEXT: pack a1, a2, a1
+; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RVA22U64-PACK-NEXT: vmv.v.x v8, a0
+; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a1
+; RVA22U64-PACK-NEXT: ret
+;
+; RV64ZVE32-LABEL: buildvec_vredmax:
+; RV64ZVE32: # %bb.0:
+; RV64ZVE32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64ZVE32-NEXT: vredmaxu.vs v8, v8, v8
+; RV64ZVE32-NEXT: vredmaxu.vs v9, v10, v10
+; RV64ZVE32-NEXT: vredmaxu.vs v10, v12, v12
+; RV64ZVE32-NEXT: vmv.x.s a0, v8
+; RV64ZVE32-NEXT: vmv.x.s a1, v9
+; RV64ZVE32-NEXT: vmv.x.s a2, v10
+; RV64ZVE32-NEXT: vredmaxu.vs v8, v14, v14
+; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64ZVE32-NEXT: vslide1up.vx v9, v8, a2
+; RV64ZVE32-NEXT: vslide1up.vx v10, v9, a1
+; RV64ZVE32-NEXT: vslide1up.vx v8, v10, a0
+; RV64ZVE32-NEXT: ret
+ %247 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg0)
+ %248 = insertelement <4 x i32> poison, i32 %247, i64 0
+ %250 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg1)
+ %251 = insertelement <4 x i32> %248, i32 %250, i64 1
+ %252 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg2)
+ %253 = insertelement <4 x i32> %251, i32 %252, i64 2
+ %254 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg3)
+ %255 = insertelement <4 x i32> %253, i32 %254, i64 3
+ ret <4 x i32> %255
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll b/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll
index da912bf401ec0..821d4240827fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll
@@ -9,12 +9,11 @@ define <2 x float> @redundant_vfmv(<2 x float> %arg0, <64 x float> %arg1, <64 x
; CHECK-NEXT: vfredusum.vs v9, v12, v8
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vfredusum.vs v8, v16, v8
-; CHECK-NEXT: vfmv.f.s fa5, v8
+; CHECK-NEXT: vfredusum.vs v9, v16, v8
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vi v8, v9, 0
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT: vfslide1up.vf v8, v9, fa5
; CHECK-NEXT: ret
%s0 = extractelement <2 x float> %arg0, i64 0
%r0 = tail call reassoc float @llvm.vector.reduce.fadd.v64f32(float %s0, <64 x float> %arg1)
      {RISCVISD::VECREDUCE_ADD_VL, RISCVISD::VECREDUCE_UMAX_VL,
       RISCVISD::VECREDUCE_SMAX_VL, RISCVISD::VECREDUCE_UMIN_VL,
       RISCVISD::VECREDUCE_SMIN_VL, RISCVISD::VECREDUCE_AND_VL,
       RISCVISD::VECREDUCE_OR_VL, RISCVISD::VECREDUCE_XOR_VL,
       RISCVISD::VECREDUCE_FADD_VL, RISCVISD::VECREDUCE_SEQ_FADD_VL,
       RISCVISD::VECREDUCE_FMAX_VL, RISCVISD::VECREDUCE_FMIN_VL}) {
    SlideUp = sd_match(
Why do we need to check if it's a reduction specifically? Does this not work for any extract_element at index 0?