37 changes: 31 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7096,6 +7096,20 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
return true;

if (WidenMemR->isReverse()) {
// If the stored value of a reverse store is invariant, LICM will
// hoist the reverse operation to the preheader. In this case, the
// result of the VPlan-based cost model will diverge from that of
// the legacy model.
if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;

if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;
}
}

// The legacy cost model costs non-header phis with a scalar VF as a phi,
@@ -7633,8 +7647,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}

VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
assert((VPI->getOpcode() == Instruction::Load ||
VPI->getOpcode() == Instruction::Store) &&
"Must be called with either a load or store");
@@ -7695,15 +7709,26 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}

if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI,
VPI->getDebugLoc());
auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
*VPI, Load->getDebugLoc());
if (Reverse) {
Builder.insert(LoadR);
return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
LoadR->getDebugLoc());
}
return LoadR;
}

StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
Consecutive, Reverse, *VPI, VPI->getDebugLoc());
VPValue *StoredVal = VPI->getOperand(0);
if (Reverse)
StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
Store->getDebugLoc());
return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
Reverse, *VPI, Store->getDebugLoc());
}

VPWidenIntOrFpInductionRecipe *
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -90,7 +90,7 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p VPI should widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);

/// Optimize the special case where the operand of \p VPI is a constant
/// integer induction variable.
8 changes: 5 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1111,6 +1111,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// Unrolling will add all copies of its original operand as additional
// operands.
LastActiveLane,
// Returns a reversed vector for the operand.
Reverse,

// The opcodes below are used for VPInstructionWithType.
//
@@ -3476,10 +3478,10 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
Comment on lines +3481 to +3482

Contributor:
The only addition here is the stored value, right? Can we unify this into a single constructor, and clarify that the VPWidenStoreRecipe is only used for the other fields?

Contributor (Author):
92733e9: unified them, thanks.

: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
{Addr, StoredVal, &EVL}, S.isConsecutive(),
S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -132,6 +132,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::Reverse:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -462,6 +462,12 @@ m_LastActiveLane(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::LastActiveLane>(Op0);
}

template <typename Op0_t>
inline VPInstruction_match<VPInstruction::Reverse, Op0_t>
m_Reverse(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::Reverse>(Op0);
}

inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() {
return m_VPInstruction<VPInstruction::StepVector>();
}
77 changes: 38 additions & 39 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -444,6 +444,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::Not:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
Expand Down Expand Up @@ -901,6 +902,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
case VPInstruction::Reverse:
return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1087,6 +1090,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
auto *VectorTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, /*Mask=*/{}, Ctx.CostKind,
/*Index=*/0);
}
case VPInstruction::ExtractLastLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
@@ -1189,6 +1200,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::WidePtrAdd:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::Reverse:
case VPInstruction::VScale:
case VPInstruction::Unpack:
return false;
@@ -1366,6 +1378,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
case VPInstruction::Reverse:
O << "reverse";
break;
case VPInstruction::Unpack:
O << "unpack";
break;
@@ -2244,18 +2259,32 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
// For Trunc/FPTrunc, get the context from the only user.
if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
!hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
CCH = ComputeCCH(StoreRecipe);
if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
return nullptr;
return dyn_cast<VPRecipeBase>(*R->user_begin());
};

if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
if (match(Recipe, m_Reverse(m_VPValue())))
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}
// For Z/Sext, get the context from the operand.
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
Opcode == Instruction::FPExt) {
Comment on lines 2277 to 2278

Contributor:
I'm not sure I understand why this code is guarded by the Trunc, FPTrunc, FPExt, ZExt, or SExt opcodes.

Contributor (Author):
This code determines the TTI::CastContextHint. For other cast opcodes there may simply be no situation that currently requires the hint, but I think that's a separate issue. The change here only ensures that CastContextHint::Reversed is still propagated correctly after the reverse operation is split out into its own recipe.

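As an illustration (a hypothetical source loop, not taken from this patch or its tests): with the reverse now split into its own VPInstruction, a widened trunc feeding a reverse store has the Reverse recipe as its only user, so the hint lookup has to see through that Reverse to still return CastContextHint::Reversed. A minimal C++ loop that produces this shape might look like:

// Hypothetical example only: a loop that walks backwards through consecutive
// memory and stores a truncated value. When vectorized, the widened trunc's
// single user is the reverse feeding the consecutive-reverse store, which is
// the case the look-through above handles.
void store_trunc_reverse(short *dst, const int *src, long n) {
  for (long i = n - 1; i >= 0; --i)
    dst[i] = static_cast<short>(src[i]); // trunc i32 -> i16, stored in reverse order
}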
if (Operand->isLiveIn())
CCH = TTI::CastContextHint::Normal;
else if (Operand->getDefiningRecipe())
CCH = ComputeCCH(Operand->getDefiningRecipe());
else if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
Recipe = ReverseOp->getDefiningRecipe();
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}

auto *SrcTy =
@@ -3500,12 +3529,7 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
OpInfo, &Ingredient);
}
if (!Reverse)
return Cost;

return Cost += Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Cost;
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
Expand Down Expand Up @@ -3536,8 +3560,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
applyMetadata(*cast<Instruction>(NewLI));
if (Reverse)
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
State.set(this, NewLI);
}

@@ -3592,8 +3614,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
if (isReverse())
Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}

@@ -3610,15 +3630,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3647,13 +3661,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}

Value *StoredVal = State.get(StoredVPValue);
if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
if (CreateScatter)
@@ -3682,8 +3689,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
if (isReverse())
StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
@@ -3720,15 +3725,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
36 changes: 26 additions & 10 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2887,25 +2887,41 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
EVL, Mask);

if (match(&CurRecipe,
VPValue *ReversedVal;
if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
match(ReversedVal,
m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
auto *LoadR = new VPWidenLoadEVLRecipe(
*cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
LoadR->insertBefore(&CurRecipe);
return new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
TypeInfo.inferScalarType(LoadR), {}, {}, DL);
}

if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
VPValue *StoredVal;
if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
m_RemoveMask(HeaderMask, Mask))) &&
!cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
EVL, Mask);
StoredVal, EVL, Mask);

if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
if (match(&CurRecipe,
m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
auto *NewReverse = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse,
{ReversedVal, Plan->getTrue(), &EVL},
TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
NewReverse->insertBefore(&CurRecipe);
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
AdjustEndPtr(EndPtr), NewReverse, EVL,
Mask);
}

if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
if (Rdx->isConditional() &&
@@ -22,8 +22,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE9]]

entry:
@@ -37,8 +37,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer