25 changes: 22 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7660,9 +7660,9 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
CM.foldTailByMasking() || !GEP
? GEPNoWrapFlags::none()
: GEP->getNoWrapFlags().withoutNoUnsignedWrap();
VectorPtr = new VPVectorEndPointerRecipe(
Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, VPI->getDebugLoc());
VectorPtr = createVectorEndPointerRecipe(Ptr, getLoadStoreType(I),
/*Stride*/ -1, Flags,
VPI->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
@@ -8351,6 +8351,25 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
RdxUnordered{/*VFScaleFactor=*/ScaleFactor}, ReductionI->getDebugLoc());
}

VPVectorEndPointerRecipe *VPRecipeBuilder::createVectorEndPointerRecipe(
VPValue *Ptr, Type *SourceElementType, int64_t Stride,
GEPNoWrapFlags GEPFlags, DebugLoc DbgLoc) {
// Offset for Part 0 = Stride * (VF - 1).
VPTypeAnalysis TypeInfo(Plan);
const DataLayout &DL =
Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(Ptr));
Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
VPValue *VF = Builder.createScalarZExtOrTrunc(&Plan.getVF(), IndexTy, VFTy,
DebugLoc::getUnknown());
VPValue *VFMinusOne = Builder.createOverflowingOp(
Instruction::Sub, {VF, Plan.getConstantInt(IndexTy, 1)}, {true, true});
VPValue *StridexVFMinusOne = Builder.createOverflowingOp(
Instruction::Mul, {VFMinusOne, Plan.getConstantInt(IndexTy, Stride)});
return new VPVectorEndPointerRecipe(Ptr, StridexVFMinusOne, SourceElementType,
Stride, GEPFlags, DbgLoc);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
if (ElementCount::isKnownGT(MinVF, MaxVF))
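For intuition, here is a minimal standalone model of the part-0 offset the new createVectorEndPointerRecipe helper materializes (a sketch, not LLVM code; the concrete VF and Stride values below are illustrative assumptions):

#include <cassert>
#include <cstdint>

// Models "Offset for Part 0 = Stride * (VF - 1)" from the helper above.
int64_t part0EndPointerOffset(int64_t Stride, int64_t VF) {
  assert(Stride < 0 && "VPVectorEndPointerRecipe requires a negative stride");
  return Stride * (VF - 1);
}

int main() {
  // With VF = 4 and Stride = -1 the offset is -3, so the end pointer
  // addresses the last lane of part 0, as a reversed access requires.
  assert(part0EndPointerOffset(-1, 4) == -3);
}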
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -149,6 +149,9 @@ class VPRecipeBuilder {
: std::make_optional(It->second);
}

void setInsertPoint(VPRecipeBase *IP) { Builder.setInsertPoint(IP); }
VPBuilder &getBuilder() const { return Builder; }

/// Find all possible partial reductions in the loop and track all of those
/// that are valid so recipes can be formed later.
void collectScaledReductions(VFRange &Range);
@@ -162,6 +165,12 @@
VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor);

/// Create and return a VectorEndPointer recipe.
VPVectorEndPointerRecipe *
createVectorEndPointerRecipe(VPValue *Ptr, Type *SourceElementType,
int64_t Stride, GEPNoWrapFlags GEPFlags,
DebugLoc DbgLoc);

/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
26 changes: 13 additions & 13 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1907,27 +1907,27 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// A recipe to compute a pointer to the last element of each part of a widened
/// memory access of IndexedTy. Used for VPWidenMemoryRecipes or
/// VPInterleaveRecipes that are reversed.
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags {
Type *SourceElementTy;

/// The constant stride of the pointer computed by this recipe, expressed in
/// units of IndexedTy.
/// units of SourceElementTy.
int64_t Stride;

public:
VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *Offset, Type *SourceElementTy,
int64_t Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
IndexedTy(IndexedTy), Stride(Stride) {
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC, {Ptr, Offset},
GEPFlags, DL),
SourceElementTy(SourceElementTy), Stride(Stride) {
assert(Stride < 0 && "Stride must be negative");
}

VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)

VPValue *getVFValue() { return getOperand(1); }
const VPValue *getVFValue() const { return getOperand(1); }
int64_t getStride() const { return Stride; }
Type *getSourceElementType() const { return SourceElementTy; }
VPValue *getOffset() const { return getOperand(1); }

void execute(VPTransformState &State) override;

@@ -1953,9 +1953,9 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
}

VPVectorEndPointerRecipe *clone() override {
return new VPVectorEndPointerRecipe(getOperand(0), getVFValue(), IndexedTy,
Stride, getGEPNoWrapFlags(),
getDebugLoc());
return new VPVectorEndPointerRecipe(getOperand(0), getOffset(),
getSourceElementType(), getStride(),
getGEPNoWrapFlags(), getDebugLoc());
}

protected:
26 changes: 4 additions & 22 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2488,28 +2488,10 @@ void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,

void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
Type *IndexTy = DL.getIndexType(State.TypeAnalysis.inferScalarType(this));

// The wide store needs to start at the last vector element.
Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
if (IndexTy != RunTimeVF->getType())
RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
// NumElt = Stride * CurrentPart * RunTimeVF
Value *NumElt = Builder.CreateMul(
ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
// LastLane = Stride * (RunTimeVF - 1)
Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
if (Stride != 1)
LastLane =
Builder.CreateMul(ConstantInt::getSigned(IndexTy, Stride), LastLane);
Value *Ptr = State.get(getOperand(0), VPLane(0));
Value *ResultPtr =
Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
getGEPNoWrapFlags());

Value *Ptr = State.get(getOperand(0), true);
Value *Offset = State.get(getOffset(), true);
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
getGEPNoWrapFlags());
Review comment (Contributor): Is this now identical to VPVectorPointerRecipe? Not for this PR, but maybe we can unify them at some point.

State.set(this, ResultPtr, /*IsScalar*/ true);
}

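The removed lines computed the address in two GEP steps (NumElt = Stride * Part * VF, then LastLane = Stride * (VF - 1)); the recipe now takes a single precomputed Offset operand instead. A small standalone check that the two formulations agree (a sketch under the stated assumptions; the per-part recurrence is the one added in VPlanUnroll.cpp further down):

#include <cassert>
#include <cstdint>

// Old lowering: Ptr + Stride*Part*VF, then + Stride*(VF - 1).
int64_t oldOffset(int64_t Stride, int64_t Part, int64_t VF) {
  return Stride * Part * VF + Stride * (VF - 1);
}

// New lowering: part 0 gets Stride*(VF - 1); each later part adds Stride*VF.
int64_t newOffset(int64_t Stride, int64_t Part, int64_t VF) {
  int64_t Offset = Stride * (VF - 1);
  for (int64_t P = 0; P < Part; ++P)
    Offset += Stride * VF;
  return Offset;
}

int main() {
  for (int64_t VF : {1, 2, 4, 8})
    for (int64_t Part = 0; Part < 4; ++Part)
      for (int64_t Stride : {-1, -2, -4})
        assert(oldOffset(Stride, Part, VF) == newOffset(Stride, Part, VF));
}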
61 changes: 45 additions & 16 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -783,6 +783,15 @@ static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
return Users.takeVector();
}

static SmallVector<VPValue *> collectOperandsRecursively(VPRecipeBase *R) {
SetVector<VPValue *> Operands(llvm::from_range, R->operands());
for (unsigned I = 0; I != Operands.size(); ++I) {
if (VPRecipeBase *Cur = Operands[I]->getDefiningRecipe())
Operands.insert_range(Cur->operands());
}
return Operands.takeVector();
}

/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
/// generates scalar values.
@@ -2843,11 +2852,30 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPValue *Addr, *Mask, *EndPtr;

/// Adjust any end pointers so that they point to the end of EVL lanes not VF.
auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
EVLEndPtr->insertBefore(&CurRecipe);
EVLEndPtr->setOperand(1, &EVL);
return EVLEndPtr;
auto AdjustEndPtr = [&EVL, &Plan, &TypeInfo](VPValue *EndPtr) {
auto *VEPR = cast<VPVectorEndPointerRecipe>(EndPtr);
SmallVector<VPValue *> Operands({VEPR->getOffset()});
VPRecipeBase *OffsetR = VEPR->getOffset()->getDefiningRecipe();
append_range(Operands, collectOperandsRecursively(OffsetR));
for (VPValue *Op : reverse(Operands)) {
if (!Op->hasDefiningRecipe())
continue;
VPRecipeBase *Clone = Op->getDefiningRecipe()->clone();
Clone->insertBefore(VEPR);
VPRecipeBase *EVLR = EVL.getDefiningRecipe();
VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
Type *EVLType = TypeInfo.inferScalarType(&EVL);
VPValue *EVLCast = Builder.createScalarZExtOrTrunc(
&EVL, TypeInfo.inferScalarType(&Plan->getVF()), EVLType,
EVLR->getDebugLoc());
Clone->replaceUsesOfWith(&Plan->getVF(), EVLCast);
Op->replaceUsesWithIf(
Clone->getVPSingleValue(), [&Operands](VPUser &U, unsigned Idx) {
return !is_contained(Operands,
cast<VPRecipeBase>(U).getVPSingleValue());
});
Review comment (Contributor): It's pretty unfortunate that changing the VF is a lot more complicated now. If it makes it easier, we should never have an ExplicitVectorLength with UF != 1. Maybe you can just assert here instead of trying to handle it fully generally.

}
return VEPR;
};

if (match(&CurRecipe,
@@ -2858,7 +2886,7 @@

if (match(&CurRecipe,
m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_VPValue())) &&
Review comment (Contributor): You still need to match here for -1 * (VF - 1), otherwise it's not a correct transform. I guess you could add a helper matcher for that?

cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
@@ -2871,7 +2899,7 @@

if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_VPValue())) &&
cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
@@ -2916,10 +2944,10 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();

assert(all_of(Plan.getVF().users(),
IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
VPWidenIntOrFpInductionRecipe>) &&
"User of VF that we can't transform to EVL.");
assert(
all_of(Plan.getVF().users(), IsaPred<VPInstruction, VPScalarIVStepsRecipe,
VPWidenIntOrFpInductionRecipe>) &&
"User of VF that we can't transform to EVL.");
Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
});
@@ -3412,6 +3440,7 @@ void VPlanTransforms::createInterleaveGroups(
Instruction *IRInsertPos = IG->getInsertPos();
auto *InsertPos =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
RecipeBuilder.setInsertPoint(InsertPos);

GEPNoWrapFlags NW = GEPNoWrapFlags::none();
if (auto *Gep = dyn_cast<GetElementPtrInst>(
@@ -3421,6 +3450,7 @@
// Get or create the start address for the interleave group.
VPValue *Addr = Start->getAddr();
VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
VPBuilder &B = RecipeBuilder.getBuilder();
if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
// We cannot re-use the address of member zero because it does not
// dominate the insert position. Instead, use the address of the insert
@@ -3436,18 +3466,17 @@
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
VPBuilder B(InsertPos);
Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
}
// If the group is reverse, adjust the index to refer to the last vector
// lane instead of the first. We adjust the index from the first vector
// lane, rather than directly getting the pointer for lane VF - 1, because
// the pointer operand of the interleaved access is supposed to be uniform.
if (IG->isReverse()) {
auto *ReversePtr = new VPVectorEndPointerRecipe(
Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
-(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
ReversePtr->insertBefore(InsertPos);
auto *ReversePtr = RecipeBuilder.createVectorEndPointerRecipe(
Addr, getLoadStoreType(IRInsertPos), -(int64_t)IG->getFactor(), NW,
InsertPos->getDebugLoc());
B.insert(ReversePtr);
Addr = ReversePtr;
}
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
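To see why AdjustEndPtr must rewrite the offset's uses of VF to EVL ("point to the end of EVL lanes not VF"), consider a tail iteration where only EVL of the VF lanes are active (a sketch; the concrete numbers are illustrative assumptions, not taken from the patch):

#include <cassert>
#include <cstdint>

int64_t endPointerOffset(int64_t Stride, int64_t ActiveLanes) {
  return Stride * (ActiveLanes - 1);
}

int main() {
  const int64_t Stride = -1, VF = 8, EVL = 5;
  assert(endPointerOffset(Stride, VF) == -7);  // full-VF offset: overshoots
  assert(endPointerOffset(Stride, EVL) == -4); // EVL-adjusted offset is needed
}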
25 changes: 20 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -313,6 +313,25 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
Copy->addOperand(VFxPart);
continue;
}
if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
VPBuilder Builder(VEPR);
VPValue *PrevOffset =
cast<VPVectorEndPointerRecipe>(getValueForPart(VEPR, Part - 1))
->getOffset();
Type *IndexTy = TypeInfo.inferScalarType(PrevOffset);
Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
VPValue *VF = Builder.createScalarZExtOrTrunc(
&Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
// Offset = PrevOffset + Stride * VF.
VPValue *VFxStride = Builder.createOverflowingOp(
Instruction::Mul,
{VF, Plan.getConstantInt(IndexTy, VEPR->getStride())});
VPValue *Offset = Builder.createOverflowingOp(Instruction::Add,
{PrevOffset, VFxStride});
Copy->setOperand(0, VEPR->getOperand(0));
Copy->setOperand(1, Offset);
continue;
}
if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
if (Phi && Phi->isOrdered()) {
@@ -329,14 +348,10 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {

// Add operand indicating the part to generate code for, to recipes still
// requiring it.
if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
VPVectorEndPointerRecipe>(Copy) ||
if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe>(Copy) ||
match(Copy,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
Copy->addOperand(getConstantInt(Part));

if (isa<VPVectorEndPointerRecipe>(R))
Copy->setOperand(0, R.getOperand(0));
}
}

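A numeric walk-through of the per-part recurrence above, Offset = PrevOffset + Stride * VF (a sketch; VF = 4 and Stride = -2, e.g. a reversed factor-2 interleave group, are assumed values):

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t VF = 4, Stride = -2;
  int64_t Offset = Stride * (VF - 1); // part 0: -6
  for (int Part = 0; Part < 4; ++Part) {
    std::printf("part %d: offset %lld\n", Part, (long long)Offset);
    Offset += Stride * VF; // parts 1..3: -14, -22, -30
  }
}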
3 changes: 1 addition & 2 deletions llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -171,8 +171,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
return VerifyEVLUse(*R, 2);
})
.Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
VPInterleaveEVLRecipe>(
.Case<VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
.Case<VPInstructionWithType>(
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })