Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions src/header/TransferBench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5383,9 +5383,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
std::set<std::pair<int,int>> CUs;
// Copy subExecutor parameters back to host before reading them.
// subExecParamGpuPtr lives in device memory and is not guaranteed to be host-accessible
// (e.g. GPUs without large/resizable BAR), so dereferencing it directly can segfault.
std::vector<SubExecParam> subExecParamHost(numSubExecs);
ERR_CHECK(hipMemcpy(subExecParamHost.data(), rss.subExecParamGpuPtr,
numSubExecs * sizeof(SubExecParam), hipMemcpyDefault));
Comment on lines +5386 to +5391

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This host-side copy should be performed only on GPUs that lack largeBar support, so that behavior on GPUs that do have largeBar support remains unchanged.

for (int i = 0; i < numSubExecs; i++) {
CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
GetId(rss.subExecParamGpuPtr[i].hwId)));
CUs.insert(std::make_pair(subExecParamHost[i].xccId,
GetId(subExecParamHost[i].hwId)));
}
rss.perIterCUs.push_back(CUs);
}
Expand Down Expand Up @@ -5452,18 +5458,25 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// If Transfers were combined into a single launch, figure out per-Transfer timing
// Determine timing for each of the individual transfers that were part of this launch
if (!cfg.gfx.useMultiStream) {
// Copy subExecutor parameters back to host before reading timing data.
// subExecParamGpu lives in device memory and is not guaranteed to be host-accessible
// (e.g. GPUs without large/resizable BAR), so dereferencing it directly can segfault.
std::vector<SubExecParam> subExecParamHost(exeInfo.totalSubExecs);
ERR_CHECK(hipMemcpy(subExecParamHost.data(), exeInfo.subExecParamGpu,
exeInfo.totalSubExecs * sizeof(SubExecParam), hipMemcpyDefault));

Comment on lines +5461 to +5467

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above: perform this copy only when the GPU lacks largeBar support, and keep the direct access on GPUs that have it.

for (int i = 0; i < exeInfo.resources.size(); i++) {
TransferResources& rss = exeInfo.resources[i];
int64_t minStartCycle = std::numeric_limits<int64_t>::max();
int64_t maxStopCycle = std::numeric_limits<int64_t>::min();
std::set<std::pair<int, int>> CUs;

for (auto subExecIdx : rss.subExecIdx) {
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
minStartCycle = std::min(minStartCycle, subExecParamHost[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, subExecParamHost[subExecIdx].stopCycle);
if (cfg.general.recordPerIteration) {
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
CUs.insert(std::make_pair(subExecParamHost[subExecIdx].xccId,
GetId(subExecParamHost[subExecIdx].hwId)));
}
}

Expand Down
Loading