[IA] Support vp.store in lowerInterleavedStore #149605

Merged · 5 commits · Jul 22, 2025
20 changes: 7 additions & 13 deletions llvm/include/llvm/CodeGen/TargetLowering.h
```diff
@@ -3219,25 +3219,19 @@ class LLVM_ABI TargetLoweringBase {
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p SI is the vector store instruction.
+  /// \p SI is the vector store instruction. Can be either a plain store
+  /// or a vp.store.
+  /// \p Mask is a per-segment (i.e. number of lanes equal to that of one
+  /// component being interwoven) mask. Can be nullptr, in which case the
+  /// result is unconditional.
   /// \p SVI is the shufflevector to RE-interleave the stored vector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  virtual bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                                     ShuffleVectorInst *SVI,
                                      unsigned Factor) const {
     return false;
   }
 
-  /// Lower an interleaved store to target specific intrinsics. Return
-  /// true on success.
-  ///
-  /// \p Store is the vp.store instruction.
-  /// \p Mask is a mask value
-  /// \p InterleaveOps is a list of values being interleaved.
-  virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                                       ArrayRef<Value *> InterleaveOps) const {
-    return false;
-  }
-
   /// Lower a deinterleave intrinsic to a target specific load intrinsic.
   /// Return true on success. Currently only supports
   /// llvm.vector.deinterleave{2,3,5,7}
```
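To orient backends picking up the new unified hook: below is a minimal sketch (hypothetical `MyTargetLowering`, not part of this patch) of how an override distinguishes the two store forms it can now receive.

```cpp
// Minimal sketch, assuming a hypothetical MyTargetLowering subclass.
// Store is either a plain StoreInst or a vp.store VPIntrinsic; Mask, when
// non-null, has one lane per segment, not one per wide vector element.
bool MyTargetLowering::lowerInterleavedStore(Instruction *Store, Value *Mask,
                                             ShuffleVectorInst *SVI,
                                             unsigned Factor) const {
  if (auto *SI = dyn_cast<StoreInst>(Store)) {
    // Plain store: the pass always passes a null mask here, so the access
    // is unconditional.
    assert(!Mask && "plain stores are unmasked");
    // ... emit the target's segment-store sequence from SI and SVI ...
    return true;
  }
  // vp.store: only targets with masked segment stores should proceed;
  // returning false leaves the IR untouched for later passes.
  return false;
}
```

The AArch64, ARM, and X86 changes below implement exactly the first branch and decline everything else; RISC-V handles both forms.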
38 changes: 9 additions & 29 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
```diff
@@ -518,46 +518,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
 
+  Value *Mask = nullptr;
   if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
-                              ElementCount::getFixed(LaneMaskLen));
-    if (!LaneMask)
+    Mask = getMask(VPStore->getMaskParam(), Factor,
+                   ElementCount::getFixed(LaneMaskLen));
+    if (!Mask)
       return false;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
                       << "\n");
-
-    IRBuilder<> Builder(VPStore);
-    // We need to effectively de-interleave the shufflemask
-    // because lowerInterleavedVPStore expects individual de-interleaved
-    // values.
-    SmallVector<Value *, 10> NewShuffles;
-    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
-    auto ShuffleMask = SVI->getShuffleMask();
-
-    for (unsigned i = 0; i < Factor; i++) {
-      for (unsigned j = 0; j < LaneMaskLen; j++)
-        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
-
-      NewShuffles.push_back(Builder.CreateShuffleVector(
-          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
-    }
-
-    // Try to create target specific intrinsics to replace the vp.store and
-    // shuffle.
-    if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
-      // We already created new shuffles.
-      return true;
   } else {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
-
-    // Try to create target specific intrinsics to replace the store and
-    // shuffle.
-    if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
-      return false;
   }
 
+  // Try to create target specific intrinsics to replace the store and
+  // shuffle.
+  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+    return false;
+
   // Already have a new target specific interleaved store. Erase the old store.
   DeadInsts.insert(Store);
   DeadInsts.insert(SVI);
```
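The deleted de-interleaving loop is the key to reading the target-side changes: component `i`, lane `j` of the interleaved value lives at wide shuffle index `i + Factor * j`, and each target now recovers this from the original shufflevector itself. A standalone demonstration of that index math (plain C++ with assumed example values, independent of LLVM):

```cpp
#include <cstdio>
#include <vector>

int main() {
  const unsigned Factor = 3, LaneMaskLen = 4;
  // Wide mask interleaving components A (indices 0..3), B (4..7), C (8..11).
  const std::vector<int> ShuffleMask = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
  for (unsigned i = 0; i < Factor; i++) {
    std::printf("component %u:", i);
    for (unsigned j = 0; j < LaneMaskLen; j++)
      // Same indexing the removed loop used to build NewShuffleMask.
      std::printf(" %d", ShuffleMask[i + Factor * j]);
    std::printf("\n"); // prints 0 1 2 3 / 4 5 6 7 / 8 9 10 11
  }
}
```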
7 changes: 6 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
```diff
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                  Value *LaneMask,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
 
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
```
3 changes: 2 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.h
```diff
@@ -215,7 +215,8 @@ class AArch64TargetLowering : public TargetLowering {
                            ArrayRef<ShuffleVectorInst *> Shuffles,
                            ArrayRef<unsigned> Indices,
                            unsigned Factor) const override;
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
```
7 changes: 6 additions & 1 deletion llvm/lib/Target/ARM/ARMISelLowering.cpp
```diff
@@ -21731,11 +21731,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
```
3 changes: 2 additions & 1 deletion llvm/lib/Target/ARM/ARMISelLowering.h
```diff
@@ -685,7 +685,8 @@ class VectorType;
                            ArrayRef<ShuffleVectorInst *> Shuffles,
                            ArrayRef<unsigned> Indices,
                            unsigned Factor) const override;
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool shouldInsertFencesForAtomic(const Instruction *I) const override;
```
6 changes: 2 additions & 4 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
```diff
@@ -434,7 +434,8 @@ class RISCVTargetLowering : public TargetLowering {
                            ArrayRef<unsigned> Indices,
                            unsigned Factor) const override;
 
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@
       Instruction *Store, Value *Mask,
       ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                               ArrayRef<Value *> InterleaveOps) const override;
-
   bool supportKCFIBundles() const override { return true; }
 
   SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
```
144 changes: 28 additions & 116 deletions llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
```diff
@@ -266,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                Value *LaneMask,
                                                 ShuffleVectorInst *SVI,
                                                 unsigned Factor) const {
-  IRBuilder<> Builder(SI);
-  const DataLayout &DL = SI->getDataLayout();
+  IRBuilder<> Builder(Store);
+  const DataLayout &DL = Store->getDataLayout();
   auto Mask = SVI->getShuffleMask();
   auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
   // Given SVI : <n*factor x ty>, then VTy : <n x ty>
   auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
                                    ShuffleVTy->getNumElements() / Factor);
+  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
```
Review thread on the `XLenTy` line above:

> **Collaborator:** We can use `Builder.getIntNTy(Subtarget.getXLen())` if you want.
>
> **Author:** I have this same pattern in a few cases in this file; I'll do this as a post-commit and get them all.
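For reference, the two spellings are interchangeable here; `IRBuilderBase::getIntNTy` simply builds the integer type from the builder's own context. A sketch of the suggested post-commit cleanup (a fragment in the context of the function above, names taken from the diff):

```cpp
// As committed in this patch:
auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
// Reviewer's suggested equivalent:
auto *XLenTy2 = Builder.getIntNTy(Subtarget.getXLen());
```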


```diff
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
     return false;
 
-  auto *PtrTy = SI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   unsigned Index;
   // If the segment store only has one active lane (i.e. the interleave is
@@ -292,27 +298,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(
     unsigned ScalarSizeInBytes =
         DL.getTypeStoreSize(ShuffleVTy->getElementType());
     Value *Data = SVI->getOperand(0);
-    auto *DataVTy = cast<FixedVectorType>(Data->getType());
     Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
     Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
-    CallInst *CI = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vp_strided_store,
-        {Data->getType(), BasePtr->getType(), Stride->getType()},
-        {Data, BasePtr, Stride, Mask, VL});
-    Align Alignment = commonAlignment(SI->getAlign(), Index * ScalarSizeInBytes);
-    CI->addParamAttr(
-        1, Attribute::getWithAlignment(CI->getContext(), Alignment));
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: Same VL as above, but i32 not xlen due to signature of
+    // vp.strided.store
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
+
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {Data, BasePtr, Stride, LaneMask, VL});
+    Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
+    CI->addParamAttr(1,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
     return true;
   }
 
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+      Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
 
   SmallVector<Value *, 10> Ops;
   SmallVector<int, 16> NewShuffleMask;
```
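The branch above turns a spread shuffle (only one live component) into a single strided store, and its addressing is easy to sanity-check in isolation: lane `Lane` of component `Index` lands `Index * sizeof(elt) + Lane * Factor * sizeof(elt)` bytes past the base pointer. A standalone check with assumed values (plain C++, not LLVM code):

```cpp
#include <cstdio>

int main() {
  const unsigned Factor = 3, Index = 2, ScalarSizeInBytes = 4; // i32 elements
  const unsigned Stride = Factor * ScalarSizeInBytes; // 12-byte stride
  const unsigned Offset = Index * ScalarSizeInBytes;  // 8-byte base offset
  for (unsigned Lane = 0; Lane < 4; ++Lane)
    // Matches the byte offsets of wide-vector elements 2, 5, 8, 11.
    std::printf("lane %u -> byte offset %u\n", Lane, Offset + Lane * Stride);
}
```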
```diff
@@ -328,13 +334,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(
 
     NewShuffleMask.clear();
   }
-  // This VL should be OK (should be executable in one vsseg instruction,
-  // potentially under larger LMULs) because we checked that the fixed vector
-  // type fits in isLegalInterleavedAccessType
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
-  Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
+  Ops.append({Ptr, LaneMask, VL});
   Builder.CreateCall(VssegNFunc, Ops);
 
   return true;
@@ -457,91 +457,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   Builder.CreateCall(VssegNFunc, Operands);
   return true;
 }
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-///   %is = tail call <vscale x 64 x i8>
-///             @llvm.vector.interleave2.nxv64i8(
-///               <vscale x 32 x i8> %load0,
-///               <vscale x 32 x i8> %load1)
-///   %wide.rvl = shl nuw nsw i32 %rvl, 1
-///   tail call void @llvm.vp.store.nxv64i8.p0(
-///     <vscale x 64 x i8> %is, ptr %ptr,
-///     %mask,
-///     i32 %wide.rvl)
-///
-/// Into:
-///   call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-///     <vscale x 32 x i8> %load1,
-///     <vscale x 32 x i8> %load2, ptr %ptr,
-///     %mask,
-///     i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
-    VPIntrinsic *Store, Value *Mask,
-    ArrayRef<Value *> InterleaveOperands) const {
-  assert(Mask && "Expect a valid mask");
-  assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
-         "Unexpected intrinsic");
-
-  const unsigned Factor = InterleaveOperands.size();
-
-  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
-  if (!VTy)
-    return false;
-
-  const DataLayout &DL = Store->getDataLayout();
-  Align Alignment = Store->getParamAlign(1).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Store);
-  Value *WideEVL = Store->getArgOperand(3);
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Store->getArgOperand(1)->getType();
-  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  if (isa<FixedVectorType>(VTy)) {
-    SmallVector<Value *, 8> Operands(InterleaveOperands);
-    Operands.append({Store->getArgOperand(1), Mask, EVL});
-    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
-                            {VTy, PtrTy, XLenTy}, Operands);
-    return true;
-  }
-
-  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-  Type *VecTupTy = TargetExtType::get(
-      Store->getContext(), "riscv.vector.tuple",
-      ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
-                              NumElts * SEW / 8),
-      Factor);
-
-  Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
-  Value *StoredVal = PoisonValue::get(VecTupTy);
-  for (unsigned i = 0; i < Factor; ++i)
-    StoredVal = Builder.CreateCall(
-        VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), ScalableVssegIntrIds[Factor - 2],
-      {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-  Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
-                       ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-  Builder.CreateCall(VssegNFunc, Operands);
-  return true;
-}
```
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.h
```diff
@@ -1668,7 +1668,8 @@ namespace llvm {
 
     /// Lower interleaved store(s) into target specific
    /// instructions/intrinsics.
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
```
8 changes: 7 additions & 1 deletion llvm/lib/Target/X86/X86InterleavedAccess.cpp
```diff
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 }
 
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(
          0 &&
          "Invalid interleaved store");
 
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
+
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
   auto Mask = SVI->getShuffleMask();
```