diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index fa536155..3b14521e 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -5383,9 +5383,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (cfg.general.recordPerIteration) { rss.perIterMsec.push_back(deltaMsec); std::set> CUs; + // Copy subExecutor parameters back to host before reading them. + // subExecParamGpuPtr lives in device memory and is not guaranteed to be host-accessible + // (e.g. GPUs without large/resizable BAR), so dereferencing it directly can segfault. + std::vector subExecParamHost(numSubExecs); + ERR_CHECK(hipMemcpy(subExecParamHost.data(), rss.subExecParamGpuPtr, + numSubExecs * sizeof(SubExecParam), hipMemcpyDefault)); for (int i = 0; i < numSubExecs; i++) { - CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId, - GetId(rss.subExecParamGpuPtr[i].hwId))); + CUs.insert(std::make_pair(subExecParamHost[i].xccId, + GetId(subExecParamHost[i].hwId))); } rss.perIterCUs.push_back(CUs); } @@ -5452,6 +5458,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // If Transfers were combined into a single launch, figure out per-Transfer timing // Determine timing for each of the individual transfers that were part of this launch if (!cfg.gfx.useMultiStream) { + // Copy subExecutor parameters back to host before reading timing data. + // subExecParamGpu lives in device memory and is not guaranteed to be host-accessible + // (e.g. GPUs without large/resizable BAR), so dereferencing it directly can segfault. + std::vector subExecParamHost(exeInfo.totalSubExecs); + ERR_CHECK(hipMemcpy(subExecParamHost.data(), exeInfo.subExecParamGpu, + exeInfo.totalSubExecs * sizeof(SubExecParam), hipMemcpyDefault)); + for (int i = 0; i < exeInfo.resources.size(); i++) { TransferResources& rss = exeInfo.resources[i]; int64_t minStartCycle = std::numeric_limits::max(); @@ -5459,11 +5472,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) std::set> CUs; for (auto subExecIdx : rss.subExecIdx) { - minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle); - maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle); + minStartCycle = std::min(minStartCycle, subExecParamHost[subExecIdx].startCycle); + maxStopCycle = std::max(maxStopCycle, subExecParamHost[subExecIdx].stopCycle); if (cfg.general.recordPerIteration) { - CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId, - GetId(exeInfo.subExecParamGpu[subExecIdx].hwId))); + CUs.insert(std::make_pair(subExecParamHost[subExecIdx].xccId, + GetId(subExecParamHost[subExecIdx].hwId))); } }