@@ -31,7 +31,7 @@ using namespace vpux;

namespace {

static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"};

//
// AddSwKernelInstructionPrefetch
@@ -66,19 +66,21 @@ class AddSwKernelInstructionPrefetch final :
size_t clusterIdx, std::string& kernelName,
mlir::SymbolRefAttr functionSymbol);

VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier,
VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier,
size_t clusterIdx, std::string& kernelName);
mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);

using SwKernelPrefetchVec = std::vector<std::pair<std::string, size_t>>;
using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;
std::pair<SwKernelPrefetchVec, size_t> getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp,
VPURT::TaskConfigVec& allTasks);
std::tuple<mlir::Operation*, mlir::Value, size_t> getFirstSwTaskInIRAndBestUpdateBarrier(
VPURT::InferenceExecutionSimulator& infSim, VPURT::TaskConfigVec& allTasks, size_t firstShvTaskIndex);
std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
mlir::Operation* firstShaveTaskInIR,
mlir::Value bestUpdateBarrier);
std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(mlir::Operation* funcOp,
SwKernelPrefetchVec& kernelsToPrefetch,
VPURT::TaskConfigVec& allTasks);

bool hasVPUSWModule(mlir::Operation* funcOp);
size_t getOffsetReservedMem(const mlir::ModuleOp module);
@@ -94,6 +96,7 @@ class AddSwKernelInstructionPrefetch final :
bool _minFreeCyclesHasValue = false;
size_t _minimumFreeCyclesForPrefetch = 250000;
bool _useDummyKernelForInstructionPrefetch = false;
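// Round-robin counter used to spread dynamically inserted prefetch kernels across tiles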
size_t _dynamicPrefetchTileCounter = 0;
};

bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -187,20 +190,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer

// For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of a prefetch kernel
VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
mlir::Value updateBarrier,
mlir::ValueRange updateBarrier,
size_t clusterIdx,
std::string& kernelName) {
mlir::OpBuilder builder(firstSwTask);
auto moduleOp = firstSwTask->getParentOfType<mlir::ModuleOp>();
auto kernelOp = kernelNameToOps[kernelName];
auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
auto reservedMemOffset = getOffsetReservedMem(moduleOp);
auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset);
auto kernelOp = kernelNameToOps[kernelName];
auto tileIndexAttr = kernelOp.getTileIndexAttr();
VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc());
const int64_t tileIndex = static_cast<int64_t>(clusterIdx);

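// Rebuild each kernel I/O as a minimal 1x1x1x1 CMX buffer on the requested tile, pointing at
// the reserved-memory offset so the dummy kernel reads and writes no live data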
auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
auto newType = mlir::cast<NDTypeInterface>(io.getType()).changeShape({1, 1, 1, 1});
auto origType = mlir::cast<NDTypeInterface>(io.getType());
auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(),
bufOp.getSectionAttr(), newSectionIndexAttr,
offsetAttr, bufOp.getSwizzlingKeyAttr());
buffers.push_back(newBuff);
return true;
@@ -230,14 +239,17 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst

auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp<VPUIP::SwKernelOp>(
builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers),
mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(),
mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex),
kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr());
// The dummy kernels here are generated after ActShaveProfilingPass,
// so we need to add skipProfiling as an attribute to avoid capturing their metadata
cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));

auto args =
(kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
(kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos"
|| kernelName == "activation_sin" || kernelName == "eltwise_equal"
|| kernelName == "eltwise_select" || kernelName == "rms_norm") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];

vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
_log.nest(), /*swKernelRunOp=*/nullptr);

@@ -316,7 +328,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat
}

if (!cache.isLoaded(kernelName)) {
kernelsToPrefetch.push_back(std::move(kernelNameAndSize));
kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex));
}
cache.loadKernel(kernelName, kernelSize);

@@ -394,7 +406,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size());
shaveIdx++) {
auto clusterIdx = shaveIdx / noOfShavesPerCluster;
auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx];
auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx];
_log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx);
auto newPrefetchKernel =
_useDummyKernelForInstructionPrefetch
@@ -410,6 +422,197 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
return prefetchedKernels;
}

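// Scan tasks strictly after `startIndex` and return the earliest cycleStart at which the SHAVE
// engines are saturated, i.e. at least numClusters * 2 SW kernels begin in the same cycle
// (e.g. 4 clusters -> 8 kernels). Per-start-time counts are memoized in `swKernelCountsCache`;
// returns uint64_t max if no saturation point exists.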
uint64_t findNextSaturationStart(size_t startIndex,
vpux::VPURT::TaskConfigVec& allTasks,
size_t numClusters,
std::map<uint64_t, size_t>& swKernelCountsCache) {

// Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
const size_t saturationThreshold = numClusters * 2;

// Iterate through tasks strictly AFTER the startIndex
for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);

if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
size_t swKernelCount = 0;
// Count all SW Kernels that start at this specific time
for (auto& task : allTasks) {
if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
swKernelCount++;
}
}
if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
break;
}
}
swKernelCountsCache[currentStartTime] = swKernelCount;
}

if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
return currentStartTime;
}
}

return std::numeric_limits<uint64_t>::max();
}

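// A candidate idle window on the insertion tile: `lookaheadGap` is the number of free cycles
// between the task at `insertionPointTaskIndex` and the next saturation point (or the
// prefetched kernel's first use, whichever comes first)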
struct GapCandidate {
uint64_t lookaheadGap = 0;
int64_t insertionPointTaskIndex = -1;

// orders candidates by descending lookahead gap
bool operator>(const GapCandidate& other) const {
return lookaheadGap > other.lookaheadGap;
}
};

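// Count the SW kernels whose simulated start time equals `startTime`. Assumes `allTasks` is
// sorted by cycleStart, which lets the scan stop at the first later task.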
size_t getSwKernelCountAtTime(uint64_t startTime,
VPURT::TaskConfigVec& allTasks) {
size_t count = 0;
for (auto& taskConfig : allTasks) {
if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
count++;
}
}
if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
break;
}
}
return count;
}

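// Walk all tasks scheduled before the prefetched kernel's first use and, for each SW task on
// tile 1 (the fixed insertion tile) whose start time is not yet saturated, measure the idle
// window up to the next saturation point. Windows of at least GAP_THRESHOLD cycles are kept
// and the largest one wins.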
std::optional<GapCandidate> findBestInsertionGap(
const std::string& kernelName,
uint64_t targetKernelGroupStartTime,
VPURT::TaskConfigVec& allTasks,
size_t numClusters,
Logger& log) {

const int64_t targetInsertTile = 1;
const uint64_t GAP_THRESHOLD = 50000;
const size_t saturationThreshold = numClusters * 2;

// <LookaheadGapSize, GapCandidate>
std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
std::map<uint64_t, size_t> swKernelCountsCache; // local cache

int64_t previousT1TaskIndex = -1;
uint64_t previousT1TaskStartTime = 0;

// Find the largest gap between a non-saturated tile-1 SW task and the next saturation point (or the kernel to be prefetched, whichever comes first)
for (size_t i = 0; i < allTasks.size(); ++i) {
auto& currentTaskConfig = allTasks[i];
uint64_t currentTaskStartTime = static_cast<uint64_t>(currentTaskConfig.cycleStart);
if (currentTaskStartTime > targetKernelGroupStartTime) {
break;
}

bool isT1Task = false;
if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp())) {
// Guard against kernels without a tileIndex attribute before reading it
auto tileIndexAttr = swOp.getTileIndexAttr();
isT1Task = (tileIndexAttr != nullptr) && (tileIndexAttr.getInt() == targetInsertTile);
}

if (previousT1TaskIndex != -1 && isT1Task) {

auto& insertionPointTask = allTasks[previousT1TaskIndex];
auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);

size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);

if (simultaneousSwKernels < saturationThreshold) {
uint64_t nextSaturationStart = findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
uint64_t lookaheadGap = 0;
if (gapEnd > previousT1TaskStartTime) {
lookaheadGap = gapEnd - previousT1TaskStartTime;
}

if (lookaheadGap >= GAP_THRESHOLD) {
GapCandidate gap;
gap.lookaheadGap = lookaheadGap;
gap.insertionPointTaskIndex = previousT1TaskIndex;
validGaps[lookaheadGap] = gap;
}
}
}

if (isT1Task) {
previousT1TaskIndex = static_cast<int64_t>(i);
previousT1TaskStartTime = currentTaskStartTime;
}
}

if (validGaps.empty()) {
log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
return std::nullopt;
}

return validGaps.begin()->second;
}

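// Fallback insertion strategy: when no common insertion point before the first SHAVE task is
// available, place each dummy prefetch kernel into the largest idle gap found before that
// kernel's first appearance, cycling the target tile round-robin across clusters.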
std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch, VPURT::TaskConfigVec& allTasks) {

auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
const auto numClusters = getNumTiles(moduleOp);
VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");

std::vector<VPUIP::SwKernelOp> prefetchedKernels{};

for (auto& kernelInfo : kernelsToPrefetch) {
std::string kernelName = std::get<0>(kernelInfo);
size_t firstAppearanceIndex = std::get<2>(kernelInfo);

if (firstAppearanceIndex >= allTasks.size()) {
_log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
continue;
}
if (kernelNameToOps.count(kernelName) == 0) {
_log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
continue;
}

auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);

auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime,
allTasks, numClusters, _log);

if (!bestGapOpt.has_value()) {
_log.trace("Kernel '{0}': No valid gap found.", kernelName);
continue;
}

GapCandidate bestGap = bestGapOpt.value();
_log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.",
kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);

if (bestGap.insertionPointTaskIndex < 0 || static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
_log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.",
kernelName, bestGap.insertionPointTaskIndex);
continue;
}

auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
_dynamicPrefetchTileCounter++;

auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(),
dynamicExecTile, kernelName);

prefetchedKernels.push_back(newPrefetchKernel);
}

return prefetchedKernels;
}

void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
auto funcOp = getOperation();
if (!hasVPUSWModule(funcOp)) {
@@ -444,18 +647,21 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks);
auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] =
getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex);
if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) {
return;
}
_log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);

if (_useDummyKernelForInstructionPrefetch) {
auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN));
auto dummyKernelResMem = config::getDummySwKernelsForInstructionPrefetchReservedMemory(module, memSpaceAttr);
VPUX_THROW_WHEN(dummyKernelResMem == nullptr,
"Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!");
}
auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
if (kernelsToPrefetch.empty()) {
return;
}
_log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);

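// If no suitable first SHAVE task (and update barrier) was found before kernel execution,
// fall back to inserting the prefetch kernels into idle execution gaps instead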
auto newPrefetchKernels = (firstShaveTaskInIR == nullptr)
? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks)
: insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);

// Update dependencies for cache handling operations to meet requirements of control graph split.
auto& barrierInfo = getAnalysis<BarrierInfo>();