-
Notifications
You must be signed in to change notification settings - Fork 15.7k
[LowerMemIntrinsics] Propagate value profile to branch weights #174490
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
ec56a67 to
17aeac5
Compare
🪟 Windows x64 Test Results
All executed tests passed, but another part of the build failed. Click on a failure below to see the details. [code=4294967295] unittests/CodeGen/GlobalISel/GlobalISelTests.exe. If these failures are unrelated to your changes (for example, tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
17aeac5 to
376b55a
Compare
376b55a to
be752cf
Compare
3328ede to
884b717
Compare
884b717 to
05598ba
Compare
|
@llvm/pr-subscribers-llvm-transforms Author: Mircea Trofin (mtrofin). Changes: Patch is 25.67 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/174490.diff — 5 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index d4e72a60fc1ea..1ec150330484d 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -38,7 +38,8 @@ LLVM_ABI void createMemCpyLoopUnknownSize(
Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile,
bool CanOverlap, const TargetTransformInfo &TTI,
- std::optional<unsigned> AtomicSize = std::nullopt);
+ std::optional<unsigned> AtomicSize = std::nullopt,
+ std::optional<uint64_t> AverageTripCount = std::nullopt);
/// Emit a loop implementing the semantics of an llvm.memcpy whose size is a
/// compile time constant. Loop is inserted at \p InsertBefore.
@@ -46,7 +47,8 @@ LLVM_ABI void createMemCpyLoopKnownSize(
Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
ConstantInt *CopyLen, Align SrcAlign, Align DestAlign, bool SrcIsVolatile,
bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI,
- std::optional<uint32_t> AtomicCpySize = std::nullopt);
+ std::optional<uint32_t> AtomicCpySize = std::nullopt,
+ std::optional<uint64_t> AverageTripCount = std::nullopt);
/// Expand \p MemCpy as a loop. \p MemCpy is not deleted.
LLVM_ABI void expandMemCpyAsLoop(MemCpyInst *MemCpy,
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 4ab99edd64baa..c6f9e8c32b90d 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -12,15 +12,23 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <limits>
#include <optional>
#define DEBUG_TYPE "lower-mem-intrinsics"
using namespace llvm;
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
/// \returns \p Len urem \p OpSize, checking for optimization opportunities.
/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize.
static Value *getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len,
@@ -60,6 +68,28 @@ struct LoopExpansionInfo {
/// required.
Value *ResidualLoopIndex = nullptr;
};
+
+std::optional<uint64_t> getAverageMemOpLoopTripCount(const MemIntrinsic &I) {
+ if (ProfcheckDisableMetadataFixes)
+ return std::nullopt;
+ if (std::optional<Function::ProfileCount> EC =
+ I.getFunction()->getEntryCount();
+ !EC || !EC->getCount())
+ return std::nullopt;
+ if (const auto Len = I.getLengthInBytes())
+ return Len->getZExtValue();
+ uint64_t Total = 0;
+ auto ProfData =
+ getValueProfDataFromInst(I, InstrProfValueKind::IPVK_MemOPSize,
+ std::numeric_limits<uint32_t>::max(), Total);
+ if (!Total)
+ return std::nullopt;
+ uint64_t TripCount = 0;
+ for (const auto &KV : ProfData)
+ TripCount += KV.Count * KV.Value;
+ return std::round(1.0 * TripCount / Total);
+}
+
} // namespace
/// Insert the control flow and loop counters for a memcpy/memset loop
@@ -94,10 +124,11 @@ struct LoopExpansionInfo {
/// to \p MainLoopStep.
/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and
/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object.
-static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore,
- Value *Len, unsigned MainLoopStep,
- unsigned ResidualLoopStep,
- StringRef BBNamePrefix) {
+static LoopExpansionInfo
+insertLoopExpansion(Instruction *InsertBefore, Value *Len,
+ unsigned MainLoopStep, unsigned ResidualLoopStep,
+ StringRef BBNamePrefix,
+ std::optional<uint64_t> AverageTripCount) {
assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) &&
"ResidualLoopStep must divide MainLoopStep if specified");
assert(ResidualLoopStep <= MainLoopStep &&
@@ -175,9 +206,18 @@ static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore,
ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
if (MustTakeMainLoop)
PreLoopBuilder.CreateBr(MainLoopBB);
- else
- PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero),
- MainLoopBB, ResidualCondBB);
+ else {
+ auto *BR = PreLoopBuilder.CreateCondBr(
+ PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), MainLoopBB,
+ ResidualCondBB);
+ if (AverageTripCount.has_value()) {
+ MDBuilder MDB(ParentFunc->getContext());
+ setFittedBranchWeights(
+ *BR, {AverageTripCount.value() % MainLoopStep, 1}, false);
+ } else {
+ setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
+ }
+ }
PreLoopBB->getTerminator()->eraseFromParent();
// Stay in the MainLoop until we have handled all the LoopUnits. Then go to
@@ -222,22 +262,32 @@ static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore,
PreLoopBuilder.CreateBr(MainLoopBB);
} else {
ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
+ MDBuilder B(ParentFunc->getContext());
PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero),
- MainLoopBB, PostLoopBB);
+ MainLoopBB, PostLoopBB,
+ B.createLikelyBranchWeights());
}
PreLoopBB->getTerminator()->eraseFromParent();
// Stay in the MainLoop until we have handled all the LoopUnits.
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits),
- MainLoopBB, PostLoopBB);
+ auto *Br = LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), MainLoopBB, PostLoopBB);
+ if (AverageTripCount.has_value())
+ setFittedBranchWeights(*Br, {AverageTripCount.value() / MainLoopStep, 1},
+ /*IsExpected=*/false);
+ else
+ setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE);
}
return LEI;
}
-void llvm::createMemCpyLoopKnownSize(
- Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
- ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI,
- std::optional<uint32_t> AtomicElementSize) {
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, ConstantInt *CopyLen,
+ Align SrcAlign, Align DstAlign,
+ bool SrcIsVolatile, bool DstIsVolatile,
+ bool CanOverlap,
+ const TargetTransformInfo &TTI,
+ std::optional<uint32_t> AtomicElementSize,
+ std::optional<uint64_t> AverageTripCount) {
// No need to expand zero length copies.
if (CopyLen->isZero())
return;
@@ -269,8 +319,9 @@ void llvm::createMemCpyLoopKnownSize(
// Skip the loop expansion entirely if the loop would never be taken.
if (LoopEndCount != 0) {
- LoopExpansionInfo LEI = insertLoopExpansion(InsertBefore, CopyLen,
- LoopOpSize, 0, "static-memcpy");
+ LoopExpansionInfo LEI =
+ insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, 0,
+ "static-memcpy", AverageTripCount);
// Fill MainLoopBB
IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
@@ -357,7 +408,8 @@ void llvm::createMemCpyLoopUnknownSize(
Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
bool CanOverlap, const TargetTransformInfo &TTI,
- std::optional<uint32_t> AtomicElementSize) {
+ std::optional<uint32_t> AtomicElementSize,
+ std::optional<uint64_t> AverageTripCount) {
BasicBlock *PreLoopBB = InsertBefore->getParent();
Function *ParentFunc = PreLoopBB->getParent();
const DataLayout &DL = ParentFunc->getDataLayout();
@@ -387,8 +439,9 @@ void llvm::createMemCpyLoopUnknownSize(
assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
"Store size is expected to match type size");
- LoopExpansionInfo LEI = insertLoopExpansion(
- InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize, "dynamic-memcpy");
+ LoopExpansionInfo LEI =
+ insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize,
+ "dynamic-memcpy", AverageTripCount);
// Fill MainLoopBB
IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
@@ -931,6 +984,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
+ std::optional<uint64_t> AverageTripCount,
bool IsVolatile) {
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
@@ -943,9 +997,16 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
IRBuilder<> Builder(OrigBB->getTerminator());
- Builder.CreateCondBr(
+ auto *ToLoopBR = Builder.CreateCondBr(
Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
LoopBB);
+ MDBuilder MDB(F->getContext());
+ if (AverageTripCount.has_value())
+ ToLoopBR->setMetadata(LLVMContext::MD_prof,
+ MDB.createLikelyBranchWeights());
+ else
+ setExplicitlyUnknownBranchWeightsIfProfiled(*ToLoopBR, DEBUG_TYPE);
+
OrigBB->getTerminator()->eraseFromParent();
unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
@@ -964,8 +1025,13 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
LoopIndex->addIncoming(NewIndex, LoopBB);
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
+ auto *LoopBR = LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, NewBB);
+ if (AverageTripCount.has_value())
+ setFittedBranchWeights(*LoopBR, {AverageTripCount.value(), 1},
+ /*IsExpected=*/false);
+ else
+ setExplicitlyUnknownBranchWeightsIfProfiled(*LoopBR, DEBUG_TYPE);
}
template <typename T>
@@ -983,6 +1049,7 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
const TargetTransformInfo &TTI,
ScalarEvolution *SE) {
bool CanOverlap = canOverlap(Memcpy, SE);
+ auto TripCount = getAverageMemOpLoopTripCount(*Memcpy);
if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
createMemCpyLoopKnownSize(
/* InsertBefore */ Memcpy,
@@ -994,7 +1061,9 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* CanOverlap */ CanOverlap,
- /* TargetTransformInfo */ TTI);
+ /* TargetTransformInfo */ TTI,
+ /* AtomicElementSize */ std::nullopt,
+ /* AverageTripCount */ TripCount);
} else {
createMemCpyLoopUnknownSize(
/* InsertBefore */ Memcpy,
@@ -1006,7 +1075,9 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
/* SrcIsVolatile */ Memcpy->isVolatile(),
/* DstIsVolatile */ Memcpy->isVolatile(),
/* CanOverlap */ CanOverlap,
- /* TargetTransformInfo */ TTI);
+ /* TargetTransformInfo */ TTI,
+ /* AtomicElementSize */ std::nullopt,
+ /* AverageTripCount */ TripCount);
}
}
@@ -1027,16 +1098,17 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
// We may not be able to emit a pointer comparison, but we don't have
// to. Expand as memcpy.
+ auto AverageTripCount = getAverageMemOpLoopTripCount(*Memmove);
if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
- createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
- CI, SrcAlign, DstAlign, SrcIsVolatile,
- DstIsVolatile,
- /*CanOverlap=*/false, TTI);
+ createMemCpyLoopKnownSize(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile,
+ /*CanOverlap=*/false, TTI, std::nullopt, AverageTripCount);
} else {
- createMemCpyLoopUnknownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
- CopyLen, SrcAlign, DstAlign, SrcIsVolatile,
- DstIsVolatile,
- /*CanOverlap=*/false, TTI);
+ createMemCpyLoopUnknownSize(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign,
+ DstAlign, SrcIsVolatile, DstIsVolatile,
+ /*CanOverlap=*/false, TTI, std::nullopt, AverageTripCount);
}
return true;
@@ -1072,7 +1144,8 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
/* CopyLen */ Memset->getLength(),
/* SetValue */ Memset->getValue(),
/* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
+ /* AverageTripCount */ getAverageMemOpLoopTripCount(*Memset),
+ /* IsVolatile */ Memset->isVolatile());
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
@@ -1081,7 +1154,8 @@ void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
/* CopyLen=*/Memset->getLength(),
/* SetValue=*/Memset->getValue(),
/* Alignment=*/Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
+ /* AverageTripCount */ getAverageMemOpLoopTripCount(*Memset),
+ /* IsVolatile */ Memset->isVolatile());
}
void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
@@ -1100,7 +1174,7 @@ void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
/* DstIsVolatile */ AtomicMemcpy->isVolatile(),
/* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
/* TargetTransformInfo */ TTI,
- /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
+ /* AtomicElementSize */ AtomicMemcpy->getElementSizeInBytes());
} else {
createMemCpyLoopUnknownSize(
/* InsertBefore */ AtomicMemcpy,
@@ -1113,6 +1187,6 @@ void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
/* DstIsVolatile */ AtomicMemcpy->isVolatile(),
/* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
/* TargetTransformInfo */ TTI,
- /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
+ /* AtomicElementSize */ AtomicMemcpy->getElementSizeInBytes());
}
}
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll
index a4e049941030e..0ca0bb2421c8b 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memcpy-inline-non-constant-len.ll
@@ -1,49 +1,62 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
; RUN: opt -mtriple=x86_64-pc-linux-gnu -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
; Constant length memcpy.inline should be left unmodified.
-define void @memcpy_32(ptr %dst, ptr %src) nounwind {
+define void @memcpy_32(ptr %dst, ptr %src) nounwind !prof !0 {
; CHECK-LABEL: define void @memcpy_32(
-; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 32, i1 false)
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
+; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 32, i1 false), !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: tail call void @llvm.memcpy.inline.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 32, i1 true)
; CHECK-NEXT: ret void
;
- call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 32, i1 0)
+ call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 32, i1 0), !prof !1
tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dst, ptr %src, i64 32, i1 1)
ret void
}
-define void @memcpy_x(ptr %dst, ptr %src, i64 %x) nounwind {
+define void @memcpy_x(ptr %dst, ptr %src, i64 %x) nounwind !prof !0 {
; CHECK-LABEL: define void @memcpy_x(
-; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[X:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
-; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_MEMCPY_EXPANSION:.*]], label %[[POST_LOOP_MEMCPY_EXPANSION:.*]]
-; CHECK: [[LOOP_MEMCPY_EXPANSION]]:
-; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOOP_MEMCPY_EXPANSION]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION:.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: [[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: store i8 [[TMP3]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[X]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[LOOP_MEMCPY_EXPANSION]], label %[[POST_LOOP_MEMCPY_EXPANSION]]
-; CHECK: [[POST_LOOP_MEMCPY_EXPANSION]]:
+; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[DYNAMIC_MEMCPY_POST_EXPANSION]]:
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[X]], 0
-; CHECK-NEXT: br i1 [[TMP7]], label %[[LOOP_MEMCPY_EXPANSION2:.*]], label %[[POST_LOOP_MEMCPY_EXPANSION1:.*]]
-; CHECK: [[LOOP_MEMCPY_EXPANSION2]]:
-; CHECK-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, %[[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP11:%.*]], %[[LOOP_MEMCPY_EXPANSION2]] ]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY2:.*]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION1:.*]], !prof [[PROF2]]
+; CHECK: [[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY2]]:
+; CHECK-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, %[[DYNAMIC_MEMCPY_POST_EXPANSION]] ], [ [[TMP11:%.*]], %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY2]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[LOOP_INDEX3]]
; CHECK-NEXT: [[TMP9:%.*]] = load volatile i8, ptr [[TMP8]], align 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[LOOP_INDEX3]]
; CHECK-NEXT: store volatile i8 [[TMP9]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[TMP11]] = add i64 [[LOOP_INDEX3]], 1
; CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[TMP11]], [[...
[truncated]
|

If the mem intrinsics have associated value-profile information, we can synthesize branch weights when converting them (the intrinsics) to loops.