[WebAssembly] Implement getInterleavedMemoryOpCost #146864

Open: wants to merge 2 commits into base: main
llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp (109 additions, 16 deletions)

@@ -150,12 +150,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
                                   CostKind);
   }
 
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  if (ISD != ISD::LOAD) {
-    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
-                                  CostKind);
-  }
-
   EVT VT = TLI->getValueType(DL, Ty, true);
   // Type legalization can't handle structs
   if (VT == MVT::Other)
@@ -166,22 +160,121 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
   if (!LT.first.isValid())
     return InstructionCost::getInvalid();
 
-  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
-  // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
-  // are twice as expensive as scalar.
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
   unsigned width = VT.getSizeInBits();
-  switch (width) {
-  default:
-    break;
-  case 32:
-  case 64:
-  case 128:
-    return 2;
+  if (ISD == ISD::LOAD) {
+    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+    // loads are twice as expensive as scalar.
+    switch (width) {
+    default:
+      break;
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
+  } else if (ISD == ISD::STORE) {
+    // For stores, we can use store lane operations.
+    switch (width) {
+    default:
+      break;
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+    case 128:
+      return 2;
+    }
   }
 
   return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
 }
 
+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+    bool UseMaskForCond, bool UseMaskForGaps) const {
+  assert(Factor >= 2 && "Invalid interleave factor");
+
+  auto *VecTy = cast<VectorType>(Ty);
+  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
+    return InstructionCost::getInvalid();
+  }
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+                                             Alignment, AddressSpace, CostKind,
+                                             UseMaskForCond, UseMaskForGaps);
+
+  constexpr unsigned MaxInterleaveFactor = 4;
+  if (Factor <= MaxInterleaveFactor) {
+    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+    // Ensure the number of vector elements is greater than 1.
+    if (MinElts < 2 || MinElts % Factor != 0)
+      return InstructionCost::getInvalid();
+
+    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+    // Ensure the element type is legal.
+    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+      return InstructionCost::getInvalid();
+
+    auto *SubVecTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().divideCoefficientBy(Factor));
+    InstructionCost MemCost =
+        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+    unsigned MaxVecSize = 128;
+    unsigned NumAccesses =
+        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
+
+    // A stride of two is commonly supported via dedicated instructions, so it
+    // should be relatively cheap for all element sizes. A stride of four is
+    // more expensive as it will likely require more shuffles. Using two
+    // simd128 inputs is considered more expensive and we mainly account for
+    // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
+    // arithmetic kernels.
+    static const CostTblEntry ShuffleCostTbl[] = {
+        // One reg.
+        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
+        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
+        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
+        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+        // Two regs.
+        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+        // One reg.
+        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
+        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
+        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+        // Two regs.
+        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+
+        // Four regs.
+        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
+    };
+
+    EVT ETy = TLI->getValueType(DL, SubVecTy);
+    if (const auto *Entry =
+            CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+      return Entry->Cost + (NumAccesses * MemCost);
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace, CostKind,
+                                           UseMaskForCond, UseMaskForGaps);
+}
+
 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     const Value *Op0, const Value *Op1) const {
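
For context, the access pattern this hook prices looks like the hypothetical kernel below (not part of the patch): every iteration reads two adjacent fields, so the loop vectorizer forms a single interleaved group with Factor == 2 and queries getInterleavedMemoryOpCost for the wide-load-plus-deinterleave sequence.

#include <cstddef>

// Hypothetical stride-2 kernel. The vectorizer sees src[2*i] and
// src[2*i+1] as one interleaved load group with Factor == 2 and element
// type i32; under the model above each vectorized iteration is charged
// one ShuffleCostTbl entry plus NumAccesses wide memory operations.
void sum_pairs(const int *src, int *dst, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    dst[i] = src[2 * i] + src[2 * i + 1];
}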
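
And a minimal standalone sketch (plain C++, not the LLVM API) re-deriving the formula above for one concrete query: Factor = 2 over an <8 x i32> group. SubVecTy is <4 x i32>, the table entry {2, MVT::v4i32, 2} charges a shuffle cost of 2, and NumAccesses rounds 256 bits of data up to two 128-bit accesses, for a total of 2 + 2 * MemCost = 6.

#include <algorithm>
#include <cassert>
#include <cstdio>

// Standalone re-derivation of the patch's cost formula with assumed
// constants, for Factor = 2 over an <8 x i32> interleaved group.
int main() {
  const unsigned Factor = 2;      // interleave stride
  const unsigned MinElts = 8;     // elements in the whole interleaved group
  const unsigned ElSize = 32;     // element size in bits (i32)
  const unsigned MaxVecSize = 128;

  assert(MinElts >= 2 && MinElts % Factor == 0);

  // SubVecTy = <MinElts / Factor x i32> = <4 x i32>, i.e. 128 bits.
  const unsigned VecSize = (MinElts / Factor) * ElSize;
  const unsigned NumAccesses =
      std::max(1u, (MinElts * ElSize + MaxVecSize - 1) / VecSize);

  const unsigned MemCost = 2;     // getMemoryOpCost for a 128-bit access
  const unsigned ShuffleCost = 2; // ShuffleCostTbl entry {2, MVT::v4i32, 2}

  // Total = shuffle cost + one memory op per 128-bit access.
  std::printf("NumAccesses=%u total=%u\n", NumAccesses,
              ShuffleCost + NumAccesses * MemCost);
  return 0; // prints: NumAccesses=2 total=6
}
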
llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h (4 additions, 0 deletions)

@@ -78,6 +78,10 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
                       TTI::TargetCostKind CostKind,
                       TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
                       const Instruction *I = nullptr) const override;
+  InstructionCost getInterleavedMemoryOpCost(
+      unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+      bool UseMaskForCond, bool UseMaskForGaps) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      TTI::TargetCostKind CostKind,
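
The new hook is only reachable through vectorizer queries, so one way to observe its effect is to run the loop vectorizer directly on IR for a kernel like sum_pairs above, e.g. opt -mtriple=wasm32-unknown-unknown -mattr=+simd128 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output test.ll, and watch the interleave-group cost decisions in the debug output. This is a sketch: -debug-only requires an assertions-enabled build, and the exact debug lines vary by LLVM version.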