From f3576443b1051c9a1e41e58f6becd2a03e340dac Mon Sep 17 00:00:00 2001 From: Harsh Menon Date: Sat, 27 Jun 2026 16:04:16 -0700 Subject: [PATCH 1/4] AMDGPU: add hotswap entry trampoline core --- amd/comgr/CMakeLists.txt | 1 + amd/comgr/src/comgr-env.cpp | 5 + amd/comgr/src/comgr-env.h | 3 + amd/comgr/src/comgr-hotswap-b0a0.cpp | 118 ++- amd/comgr/src/comgr-hotswap-elf.cpp | 772 ++++++++++++++++-- .../src/comgr-hotswap-entry-trampoline.cpp | 597 ++++++++++++++ amd/comgr/src/comgr-hotswap-internal.h | 121 ++- amd/comgr/src/comgr-hotswap-llvm.cpp | 50 +- amd/comgr/src/comgr-hotswap.cpp | 188 ++++- amd/comgr/src/hotswap/README.md | 21 +- .../test-lit/comgr-sources/hotswap-rewrite.c | 21 +- .../hotswap-kernel-entry-trampoline.s | 81 ++ amd/comgr/test-unit/CMakeLists.txt | 1 + amd/comgr/test-unit/HotswapElfTest.cpp | 383 +++++---- amd/comgr/test-unit/HotswapMCTest.cpp | 511 ++++++++---- amd/comgr/test-unit/comgr-test-elf-utils.h | 266 ++++++ 16 files changed, 2680 insertions(+), 459 deletions(-) create mode 100644 amd/comgr/src/comgr-hotswap-entry-trampoline.cpp create mode 100644 amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s diff --git a/amd/comgr/CMakeLists.txt b/amd/comgr/CMakeLists.txt index b72fc5444b769..d489667b1ecf2 100644 --- a/amd/comgr/CMakeLists.txt +++ b/amd/comgr/CMakeLists.txt @@ -103,6 +103,7 @@ set(SOURCES src/comgr-env.cpp src/comgr-hotswap.cpp src/comgr-hotswap-b0a0.cpp + src/comgr-hotswap-entry-trampoline.cpp src/comgr-hotswap-patch-trampoline.cpp src/comgr-hotswap-elf.cpp src/comgr-hotswap-llvm.cpp diff --git a/amd/comgr/src/comgr-env.cpp b/amd/comgr/src/comgr-env.cpp index 7922cf67895c1..52ffc17a625ec 100644 --- a/amd/comgr/src/comgr-env.cpp +++ b/amd/comgr/src/comgr-env.cpp @@ -84,6 +84,11 @@ bool shouldEmitVerboseLogs() { return VerboseLogs && StringRef(VerboseLogs) != "0"; } +bool shouldUseHotswapEntryTrampolines() { + static char *EntryTrampolines = getenv("AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES"); + return EntryTrampolines && StringRef(EntryTrampolines) != "0"; +} + llvm::StringRef getLLVMPath() { static const char *EnvLLVMPath = std::getenv("LLVM_PATH"); return EnvLLVMPath; diff --git a/amd/comgr/src/comgr-env.h b/amd/comgr/src/comgr-env.h index 9715b48a32c46..75e9d30ab99ca 100644 --- a/amd/comgr/src/comgr-env.h +++ b/amd/comgr/src/comgr-env.h @@ -26,6 +26,9 @@ std::optional getRedirectLogs(); /// Return whether the environment requests verbose logging. bool shouldEmitVerboseLogs(); +/// Return whether hotswap should redirect kernel descriptors to entry stubs. +bool shouldUseHotswapEntryTrampolines(); + /// Return whether the environment requests time statistics collection. bool needTimeStatistics(); diff --git a/amd/comgr/src/comgr-hotswap-b0a0.cpp b/amd/comgr/src/comgr-hotswap-b0a0.cpp index 38599efbc96e2..9551ae1f13f83 100644 --- a/amd/comgr/src/comgr-hotswap-b0a0.cpp +++ b/amd/comgr/src/comgr-hotswap-b0a0.cpp @@ -7,7 +7,7 @@ /// /// \file /// Dispatcher for B0-to-A0 silicon stepping patches and the -/// retargetCodeObjectB0A0 orchestrator that drives the full pipeline: +/// retargetCodeObject orchestrator that drives the full pipeline: /// decode -> patch -> trampoline growth -> DWARF update. /// /// Patch passes are dispatched through HotswapPatchVTable. The membership @@ -32,6 +32,8 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Compiler.h" +#include + using namespace llvm; namespace COMGR { @@ -118,7 +120,7 @@ void patchDebugFrame(uint8_t *Elf, size_t ElfSize, uint64_t TextAddr, // invokes it eagerly on the singleton's private storage, which the C++11 // magic-static rule guarantees runs exactly once even under concurrent // first access. That removes both the explicit std::call_once at the -// retargetCodeObjectB0A0 entry point and any inter-TU static-init order +// retargetCodeObject entry point and any inter-TU static-init order // dependency on the patch modules. void installHotswapPatches(HotswapPatchVTable &VT) { @@ -426,7 +428,7 @@ applyGfx1250B0toA0Rules(std::vector &Decoded, return Patched; } -// -- retargetCodeObjectB0A0 helpers ------------------------------------------- +// -- retargetCodeObject helpers ------------------------------------------- /// Finalize the deferred trampolines produced by emitToTrampoline: resolves /// the branch-back at the tail of each trampoline to land on the next @@ -434,7 +436,7 @@ applyGfx1250B0toA0Rules(std::vector &Decoded, /// padding at the original .text slot, and reports per-trampoline encoding /// failures through log(). Runs after all patch passes finish so the /// post-.text layout of trampolines is known. Returns false if any -/// trampoline could not be fixed up, but still patches the ones that can. +/// trampoline could not be fixed up. [[nodiscard]] static bool fixupTrampolineBranches(std::vector &Trampolines, uint8_t *Text, uint64_t TextSize, const LLVMState &LS) { @@ -480,15 +482,15 @@ fixupTrampolineBranches(std::vector &Trampolines, uint8_t *Text, /// implementations land in separate PRs. static void patchDebugSections(WritableMemoryBuffer &ElfBuf, ArrayRef Trampolines, - const ElfView &Elf, size_t TrampTotal) { + const ElfView &Elf, size_t GrowthTotal) { uint8_t *Data = reinterpret_cast(ElfBuf.getBufferStart()); size_t Size = ElfBuf.getBufferSize(); if (!addTrampolineSymbols(ElfBuf, Trampolines, Elf.textSize(), Elf.textSectionIndex())) log() << "hotswap: error: addTrampolineSymbols failed\n"; - patchDebugRanges(Data, Size, Elf.textAddr(), Elf.textSize(), TrampTotal); - patchDebugInfo(Data, Size, Elf.textAddr(), Elf.textSize(), TrampTotal); - patchDebugFrame(Data, Size, Elf.textAddr(), Elf.textSize(), TrampTotal); + patchDebugRanges(Data, Size, Elf.textAddr(), Elf.textSize(), GrowthTotal); + patchDebugInfo(Data, Size, Elf.textAddr(), Elf.textSize(), GrowthTotal); + patchDebugFrame(Data, Size, Elf.textAddr(), Elf.textSize(), GrowthTotal); if (!patchDebugLine(ElfBuf, Trampolines, Elf.textSize(), Elf.textAddr())) log() << "hotswap: error: patchDebugLine failed\n"; } @@ -520,16 +522,31 @@ static void runScratchVerification(WritableMemoryBuffer &OutBuf, << "scratch conflicts\n"; } -// -- retargetCodeObjectB0A0 --------------------------------------------------- +// -- retargetCodeObject ------------------------------------------------------- -amd_comgr_status_t retargetCodeObjectB0A0(const void *ElfData, size_t ElfSize, - const TargetIdentifier &TargetIdent, - std::unique_ptr &Out) { +amd_comgr_status_t retargetCodeObject(const void *ElfData, size_t ElfSize, + const TargetIdentifier &TargetIdent, + const Gfx1250RewriteOptions &Options, + std::unique_ptr &Out) { // The dispatcher fetches the patch vtable lazily via // getHotswapPatchVTable() inside applyGfx1250B0toA0Rules; the singleton's // initializer binds every register*Patch slot on first access, so no // explicit install step is needed here. + if (!Options.RunB0A0Patches && !Options.RunEntryTrampolines) { + std::unique_ptr Result = + WritableMemoryBuffer::getNewUninitMemBuffer(ElfSize); + if (!Result) { + log() << "hotswap: error: retargetCodeObject: " + << "getNewUninitMemBuffer(" << ElfSize + << ") failed (out of memory) for the no-op output copy.\n"; + return AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES; + } + std::memcpy(Result->getBufferStart(), ElfData, ElfSize); + Out = std::move(Result); + return AMD_COMGR_STATUS_SUCCESS; + } + // Take a working copy so the input is preserved and we have a mutable // buffer to parse / patch. std::vector Buf(static_cast(ElfData), @@ -537,12 +554,12 @@ amd_comgr_status_t retargetCodeObjectB0A0(const void *ElfData, size_t ElfSize, Expected ViewOrErr = ElfView::create(Buf.data(), Buf.size()); if (!ViewOrErr) { - log() << "hotswap: error: retargetCodeObjectB0A0: input is not a " + log() << "hotswap: error: retargetCodeObject: input is not a " << "parseable ELF64 (" << toString(ViewOrErr.takeError()) << ").\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } if (ViewOrErr->textSize() == 0) { - log() << "hotswap: error: retargetCodeObjectB0A0: input ELF has empty " + log() << "hotswap: error: retargetCodeObject: input ELF has empty " << ".text section; nothing to rewrite.\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } @@ -550,7 +567,7 @@ amd_comgr_status_t retargetCodeObjectB0A0(const void *ElfData, size_t ElfSize, LLVMState LS = initLLVM(TargetIdent); if (!LS.Valid) { - log() << "hotswap: error: retargetCodeObjectB0A0: initLLVM failed " + log() << "hotswap: error: retargetCodeObject: initLLVM failed " << "for CPU '" << TargetIdent.Processor << "'; aborting rewrite.\n"; return AMD_COMGR_STATUS_ERROR; } @@ -558,41 +575,72 @@ amd_comgr_status_t retargetCodeObjectB0A0(const void *ElfData, size_t ElfSize, RewriteConfig Config = makeGfx1250B0A0Config(); uint8_t *Text = Elf.textData(); - std::vector Decoded; - if (!decodeTextSection(Text, Elf.textSize(), LS, Decoded)) { - log() << "hotswap: error: retargetCodeObjectB0A0: decodeTextSection " - << "failed on .text (" << Elf.textSize() << " bytes).\n"; - return AMD_COMGR_STATUS_ERROR; - } - + uint64_t Count = 0; std::vector Deferred; std::vector ScratchPatches; - uint32_t Count = applyGfx1250B0toA0Rules( - Decoded, Text, Elf.textSize(), LS, Deferred, Elf, ScratchPatches, Config); + if (Options.RunB0A0Patches) { + std::vector Decoded; + if (!decodeTextSection(Text, Elf.textSize(), LS, Decoded)) { + log() << "hotswap: error: retargetCodeObject: decodeTextSection " + << "failed on .text (" << Elf.textSize() << " bytes).\n"; + return AMD_COMGR_STATUS_ERROR; + } - log() << "hotswap: applied " << Count << " patches\n"; + Count = applyGfx1250B0toA0Rules(Decoded, Text, Elf.textSize(), LS, Deferred, + Elf, ScratchPatches, Config); + log() << "hotswap: applied " << Count << " B0-to-A0 patches\n"; + } else { + log() << "hotswap: B0-to-A0 patches disabled for this rewrite\n"; + } std::unique_ptr Result; + std::vector Growth = Deferred; if (!Deferred.empty()) { - if (!fixupTrampolineBranches(Deferred, Text, Elf.textSize(), LS)) - log() << "hotswap: error: some trampolines could not be fixed up\n"; + if (!fixupTrampolineBranches(Deferred, Text, Elf.textSize(), LS)) { + log() << "hotswap: error: trampoline branch fixup failed; aborting " + "rewrite\n"; + return AMD_COMGR_STATUS_ERROR; + } + Growth = Deferred; + } + + std::vector EntryFixups; + if (Options.RunEntryTrampolines) { + std::optional EntryCount = appendKernelEntryTrampolines( + Elf, LS, Config.MaxSgprs, Growth, EntryFixups); + if (!EntryCount) + return AMD_COMGR_STATUS_ERROR; + Count += *EntryCount; + } else { + log() << "hotswap: kernel-entry trampolines disabled for this rewrite\n"; + } - Result = Elf.growWithTrampolines(Deferred, LS.SNopBytes); + if (!Growth.empty()) { + Result = Elf.growWithTrampolines(Growth, LS.SNopBytes); if (!Result) { - log() << "hotswap: error: retargetCodeObjectB0A0: " + log() << "hotswap: error: retargetCodeObject: " << "ElfView::growWithTrampolines returned null with " - << Deferred.size() << " trampolines queued.\n"; + << Growth.size() << " trampolines queued.\n"; return AMD_COMGR_STATUS_ERROR; } - size_t TrampTotal = 0; - for (const Trampoline &T : Deferred) - TrampTotal += T.Bytes.size(); - patchDebugSections(*Result, Deferred, Elf, TrampTotal); + size_t GrowthTotal = 0; + for (const Trampoline &T : Growth) { + if (T.Bytes.size() > std::numeric_limits::max() - GrowthTotal) { + log() << "hotswap: error: retargetCodeObject: growth byte count " + << "overflows size_t.\n"; + return AMD_COMGR_STATUS_ERROR; + } + GrowthTotal += T.Bytes.size(); + } + patchDebugSections(*Result, Deferred, Elf, GrowthTotal); + if (!rewriteKernelEntryDescriptorOffsets(*Result, Elf.textSize(), + EntryFixups)) + return AMD_COMGR_STATUS_ERROR; } else { Result = WritableMemoryBuffer::getNewUninitMemBuffer(ElfSize); if (!Result) { - log() << "hotswap: error: retargetCodeObjectB0A0: " + log() << "hotswap: error: retargetCodeObject: " << "getNewUninitMemBuffer(" << ElfSize << ") failed (out of memory) for the patched output copy.\n"; return AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES; diff --git a/amd/comgr/src/comgr-hotswap-elf.cpp b/amd/comgr/src/comgr-hotswap-elf.cpp index 29552e6dd86ea..51e1538f6c9b8 100644 --- a/amd/comgr/src/comgr-hotswap-elf.cpp +++ b/amd/comgr/src/comgr-hotswap-elf.cpp @@ -16,7 +16,12 @@ #include "comgr-hotswap-internal.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/Support/CheckedArithmetic.h" + +#include +#include using namespace llvm; @@ -29,18 +34,244 @@ using Phdr = ELF::Elf64_Phdr; using ELFT = ElfView::ELFT; using ELFFileT = ElfView::ELFFileT; +static constexpr unsigned SgprEncodingGranule = 8; + +enum class MetadataSgprUpdateStatus { + NotFound, + Found, + Error, +}; + +static std::optional checkedAdd(uint64_t LHS, uint64_t RHS, + StringRef Context) { + std::optional Result = checkedAddUnsigned(LHS, RHS); + if (Result) + return Result; + + log() << "hotswap: error: " << Context << " overflows uint64_t.\n"; + return std::nullopt; +} + +static std::optional +checkedAlignToSize(size_t Value, uint64_t Alignment, StringRef Context) { + if (Alignment <= 1) + return Value; + uint64_t Value64 = static_cast(Value); + uint64_t Remainder = Value64 % Alignment; + if (Remainder == 0) + return Value; + std::optional Aligned = + checkedAdd(Value64, Alignment - Remainder, Context); + if (!Aligned) + return std::nullopt; + if (*Aligned > static_cast(std::numeric_limits::max())) { + log() << "hotswap: error: " << Context << " exceeds size_t.\n"; + return std::nullopt; + } + return static_cast(*Aligned); +} + +static std::optional checkedSectionFileOffset(const ELFT::Shdr &Sec, + uint64_t VAddr, + uint64_t AccessSize, + uint64_t FileSize, + StringRef Context) { + if (VAddr < Sec.sh_addr) { + log() << "hotswap: error: " << Context << " has vaddr 0x" + << utohexstr(VAddr) << " before containing section vaddr 0x" + << utohexstr(Sec.sh_addr) << ".\n"; + return std::nullopt; + } + + uint64_t Delta = VAddr - Sec.sh_addr; + std::optional FileOffset = + checkedAdd(Sec.sh_offset, Delta, (Twine(Context) + " file offset").str()); + if (!FileOffset) + return std::nullopt; + + if (AccessSize > FileSize || *FileOffset > FileSize - AccessSize) { + log() << "hotswap: error: " << Context + << " extends past end of ELF at file offset 0x" + << utohexstr(*FileOffset) << ".\n"; + return std::nullopt; + } + return FileOffset; +} + +static std::optional +readSgprCountMetadataNode(const msgpack::DocNode &SgprNode, + StringRef KernelName, StringRef Context) { + if (SgprNode.getKind() == msgpack::Type::UInt) { + uint64_t SgprCount = SgprNode.getUInt(); + if (SgprCount > std::numeric_limits::max()) { + log() << "hotswap: error: " << Context << ": .sgpr_count for '" + << KernelName << "' exceeds unsigned.\n"; + return std::nullopt; + } + return static_cast(SgprCount); + } + + if (SgprNode.getKind() == msgpack::Type::Int) { + int64_t SgprCount = SgprNode.getInt(); + if (SgprCount < 0 || static_cast(SgprCount) > + std::numeric_limits::max()) { + log() << "hotswap: error: " << Context << ": .sgpr_count for '" + << KernelName << "' is outside unsigned range.\n"; + return std::nullopt; + } + return static_cast(SgprCount); + } + + log() << "hotswap: error: " << Context << ": .sgpr_count for '" << KernelName + << "' is not an integer.\n"; + return std::nullopt; +} + +static MetadataSgprUpdateStatus +updateKernelMetadataSgprCount(uint8_t *Elf, const ELFFileT &File, + StringRef KernelName, unsigned RequiredSgprs) { + Expected PhdrsOrErr = File.program_headers(); + if (!PhdrsOrErr) { + log() << "hotswap: error: updateKernelMetadataSgprCount: failed to read " + << "program headers: " << toString(PhdrsOrErr.takeError()) << "\n"; + return MetadataSgprUpdateStatus::Error; + } + + bool SawMetadataNote = false; + for (const ELFT::Phdr &Phdr : *PhdrsOrErr) { + if (Phdr.p_type != ELF::PT_NOTE) + continue; + + Error Err = Error::success(); + for (ELFT::Note Note : File.notes(Phdr, Err)) { + if (Note.getName() != "AMDGPU" || + Note.getType() != ELF::NT_AMDGPU_METADATA) + continue; + SawMetadataNote = true; + + ArrayRef Desc = Note.getDesc(4); + if (Desc.empty()) { + log() << "hotswap: error: updateKernelMetadataSgprCount: AMDGPU " + << "metadata note has an empty descriptor.\n"; + return MetadataSgprUpdateStatus::Error; + } + + StringRef Blob(reinterpret_cast(Desc.data()), Desc.size()); + msgpack::Document Doc; + if (!Doc.readFromBlob(Blob, false)) { + log() << "hotswap: error: updateKernelMetadataSgprCount: failed to " + << "parse AMDGPU metadata note.\n"; + return MetadataSgprUpdateStatus::Error; + } + + msgpack::DocNode Root = Doc.getRoot(); + if (!Root.isMap()) { + log() << "hotswap: error: updateKernelMetadataSgprCount: AMDGPU " + << "metadata root is not a map.\n"; + return MetadataSgprUpdateStatus::Error; + } + + msgpack::MapDocNode &RootMap = Root.getMap(); + msgpack::DocNode::MapTy::iterator KernelsIt = + RootMap.find("amdhsa.kernels"); + if (KernelsIt == RootMap.end() || !KernelsIt->second.isArray()) + continue; + + msgpack::ArrayDocNode &KernelArray = KernelsIt->second.getArray(); + for (msgpack::DocNode &KNode : KernelArray) { + if (!KNode.isMap()) + continue; + + msgpack::MapDocNode &KMap = KNode.getMap(); + msgpack::DocNode::MapTy::iterator NameIt = KMap.find(".name"); + if (NameIt == KMap.end() || !NameIt->second.isString() || + NameIt->second.getString() != KernelName) + continue; + + msgpack::DocNode::MapTy::iterator SgprIt = KMap.find(".sgpr_count"); + if (SgprIt == KMap.end()) { + log() << "hotswap: error: updateKernelMetadataSgprCount: metadata " + << "for kernel '" << KernelName << "' has no .sgpr_count.\n"; + return MetadataSgprUpdateStatus::Error; + } + + std::optional CurrentSgprs = readSgprCountMetadataNode( + SgprIt->second, KernelName, "updateKernelMetadataSgprCount"); + if (!CurrentSgprs) + return MetadataSgprUpdateStatus::Error; + if (RequiredSgprs <= *CurrentSgprs) + return MetadataSgprUpdateStatus::Found; + + SgprIt->second = static_cast(RequiredSgprs); + std::string NewBlob; + Doc.writeToBlob(NewBlob); + if (NewBlob.size() != Blob.size()) { + log() << "hotswap: error: updateKernelMetadataSgprCount: updating " + << ".sgpr_count for '" << KernelName << "' changes metadata " + << "note size from " << Blob.size() << " to " << NewBlob.size() + << " bytes; in-place rewrite cannot preserve ELF layout.\n"; + return MetadataSgprUpdateStatus::Error; + } + + const uint8_t *DescBegin = Desc.data(); + if (DescBegin < File.base() || DescBegin >= File.end()) { + log() << "hotswap: error: updateKernelMetadataSgprCount: metadata " + << "descriptor pointer is outside the ELF buffer.\n"; + return MetadataSgprUpdateStatus::Error; + } + size_t DescOffset = DescBegin - File.base(); + if (Desc.size() > File.getBufSize() || + DescOffset > File.getBufSize() - Desc.size()) { + log() << "hotswap: error: updateKernelMetadataSgprCount: metadata " + << "descriptor extends past the ELF buffer.\n"; + return MetadataSgprUpdateStatus::Error; + } + + std::memcpy(Elf + DescOffset, NewBlob.data(), NewBlob.size()); + return MetadataSgprUpdateStatus::Found; + } + } + + if (Err) { + log() << "hotswap: error: updateKernelMetadataSgprCount: failed to " + << "iterate AMDGPU notes: " << toString(std::move(Err)) << "\n"; + return MetadataSgprUpdateStatus::Error; + } + } + + if (SawMetadataNote) { + log() << "hotswap: error: updateKernelMetadataSgprCount: AMDGPU metadata " + << "has no entry for kernel '" << KernelName << "'.\n"; + return MetadataSgprUpdateStatus::Error; + } + return MetadataSgprUpdateStatus::NotFound; +} + // -- applyByteReplace --------------------------------------------------------- bool applyByteReplace(const RewriteRule &Rule, uint64_t InstOffset, uint32_t InstSize, uint8_t *Text, uint64_t TextSize, const LLVMState &S) { - if (InstOffset + InstSize > TextSize) + if (InstOffset > TextSize || InstSize > TextSize - InstOffset) { + log() << "hotswap: error: applyByteReplace: instruction range [0x" + << utohexstr(InstOffset) << ", 0x" + << utohexstr(InstOffset + static_cast(InstSize)) + << ") extends past .text size 0x" << utohexstr(TextSize) << ".\n"; return false; + } const size_t ReplaceSize = Rule.ReplaceBytes.size(); - if (ReplaceSize > InstSize) + if (ReplaceSize > InstSize) { + log() << "hotswap: error: applyByteReplace: replacement size " + << ReplaceSize << " exceeds original instruction size " << InstSize + << " at .text offset 0x" << utohexstr(InstOffset) << ".\n"; return false; - if (S.SNopBytes.size() != MinInstSize) + } + if (S.SNopBytes.size() != MinInstSize) { + log() << "hotswap: error: applyByteReplace: cached s_nop size " + << S.SNopBytes.size() << " does not match expected size " + << MinInstSize << ".\n"; return false; + } std::memcpy(Text + InstOffset, Rule.ReplaceBytes.data(), ReplaceSize); uint64_t PadOffset = InstOffset + ReplaceSize; uint64_t Remaining = InstSize - ReplaceSize; @@ -192,18 +423,199 @@ uint8_t *ElfView::findKernelDescriptor(StringRef KernelName) { continue; } const ELFT::Shdr &HostShdr = **HostShdrOrErr; - if (Sym.st_value < HostShdr.sh_addr) - continue; - uint64_t FileOffset = - HostShdr.sh_offset + (Sym.st_value - HostShdr.sh_addr); - if (FileOffset + KdSize > size()) + std::optional FileOffset = checkedSectionFileOffset( + HostShdr, Sym.st_value, KdSize, size(), + (Twine("findKernelDescriptor: descriptor symbol '") + *NameOrErr + + "'") + .str()); + if (!FileOffset) continue; - return data() + FileOffset; + return data() + *FileOffset; } } return nullptr; } +// -- ElfView::kernelDescriptors ----------------------------------------------- + +std::vector ElfView::kernelDescriptors() const { + namespace hsa = amdhsa; + std::vector Result; + + for (const ELFT::Shdr &SymShdr : Sections) { + if (SymShdr.sh_type != ELF::SHT_SYMTAB && + SymShdr.sh_type != ELF::SHT_DYNSYM) + continue; + + Expected SymsOrErr = File.symbols(&SymShdr); + if (!SymsOrErr) { + log() << "hotswap: error: kernelDescriptors: failed to read symbols: " + << toString(SymsOrErr.takeError()) << "\n"; + continue; + } + Expected StrTabOrErr = + File.getStringTableForSymtab(SymShdr, Sections); + if (!StrTabOrErr) { + log() << "hotswap: error: kernelDescriptors: failed to read symbol " + << "string table: " << toString(StrTabOrErr.takeError()) << "\n"; + continue; + } + + for (const ELFT::Sym &Sym : *SymsOrErr) { + Expected NameOrErr = Sym.getName(*StrTabOrErr); + if (!NameOrErr) { + log() << "hotswap: error: kernelDescriptors: failed to read symbol " + << "name: " << toString(NameOrErr.takeError()) << "\n"; + continue; + } + if (!NameOrErr->ends_with(".kd")) + continue; + + Expected HostShdrOrErr = + File.getSection(Sym.st_shndx); + if (!HostShdrOrErr) { + log() << "hotswap: error: kernelDescriptors: descriptor symbol '" + << *NameOrErr << "' has unreadable section index " << Sym.st_shndx + << ": " << toString(HostShdrOrErr.takeError()) << "\n"; + continue; + } + const ELFT::Shdr &HostShdr = **HostShdrOrErr; + std::optional FileOffset = checkedSectionFileOffset( + HostShdr, Sym.st_value, KdSize, size(), + (Twine("kernelDescriptors: descriptor symbol '") + *NameOrErr + "'") + .str()); + if (!FileOffset) + continue; + + int64_t EntryOffset = 0; + std::memcpy( + &EntryOffset, + data() + *FileOffset + + offsetof(hsa::kernel_descriptor_t, kernel_code_entry_byte_offset), + sizeof(EntryOffset)); + + std::string KernelName = NameOrErr->drop_back(3).str(); + const bool Seen = std::any_of( + Result.begin(), Result.end(), [&](const KernelDescriptorInfo &Info) { + return Info.KernelName == KernelName && Info.VAddr == Sym.st_value; + }); + if (!Seen) + Result.push_back({std::move(KernelName), Sym.st_value, EntryOffset}); + } + } + + return Result; +} + +std::optional +ElfView::getKernelDescriptorVAddr(StringRef KernelName) const { + for (const KernelDescriptorInfo &Info : kernelDescriptors()) { + if (Info.KernelName == KernelName) + return Info.VAddr; + } + return std::nullopt; +} + +bool ElfView::updateKernelDescriptorEntryOffset(StringRef KernelName, + int64_t NewEntryOffset) { + namespace hsa = amdhsa; + uint8_t *Kd = findKernelDescriptor(KernelName); + if (!Kd) { + log() << "hotswap: error: updateKernelDescriptorEntryOffset: kernel " + << "descriptor symbol '" << KernelName << ".kd' not found.\n"; + return false; + } + std::memcpy( + Kd + offsetof(hsa::kernel_descriptor_t, kernel_code_entry_byte_offset), + &NewEntryOffset, sizeof(NewEntryOffset)); + return true; +} + +bool ElfView::updateKernelDescriptorSgprCount(StringRef KernelName, + unsigned RequiredSgprs) { + namespace hsa = amdhsa; + if (RequiredSgprs == 0) + return true; + + uint8_t *Kd = findKernelDescriptor(KernelName); + if (!Kd) { + log() << "hotswap: error: updateKernelDescriptorSgprCount: kernel " + << "descriptor symbol '" << KernelName << ".kd' not found.\n"; + return false; + } + + uint32_t Rsrc1 = 0; + std::memcpy(&Rsrc1, + Kd + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc1), + sizeof(Rsrc1)); + + uint32_t CurrentGranulated = AMDHSA_BITS_GET( + Rsrc1, hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT); + uint64_t CurrentSgprs = + (static_cast(CurrentGranulated) + 1) * SgprEncodingGranule; + + std::optional RequiredGranulated; + if (RequiredSgprs > CurrentSgprs) { + uint64_t RequiredGranulated64 = + (static_cast(RequiredSgprs) + SgprEncodingGranule - 1) / + SgprEncodingGranule - + 1; + uint32_t MaxGranulated = static_cast( + hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT >> + hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT); + if (RequiredGranulated64 > MaxGranulated) { + log() << "hotswap: error: updateKernelDescriptorSgprCount: kernel '" + << KernelName << "' needs " << RequiredSgprs + << " SGPRs, which exceeds the descriptor encoding limit.\n"; + return false; + } + RequiredGranulated = static_cast(RequiredGranulated64); + } + + MetadataSgprUpdateStatus MetadataStatus = + updateKernelMetadataSgprCount(data(), File, KernelName, RequiredSgprs); + if (MetadataStatus == MetadataSgprUpdateStatus::Error) + return false; + // NotFound is allowed for minimal code objects without AMDGPU metadata; in + // that case the descriptor field remains the only SGPR count to update. + + if (!RequiredGranulated) + return true; + + AMDHSA_BITS_SET(Rsrc1, hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, + *RequiredGranulated); + std::memcpy(Kd + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc1), + &Rsrc1, sizeof(Rsrc1)); + return true; +} + +std::optional +ElfView::getKernelDescriptorInstPrefSize(StringRef KernelName, + StringRef TargetCpu) const { + namespace hsa = amdhsa; + uint8_t *Kd = const_cast(this)->findKernelDescriptor(KernelName); + if (!Kd) { + log() << "hotswap: error: getKernelDescriptorInstPrefSize: kernel " + << "descriptor symbol '" << KernelName << ".kd' not found.\n"; + return std::nullopt; + } + + uint32_t Rsrc3 = 0; + std::memcpy(&Rsrc3, + Kd + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc3), + sizeof(Rsrc3)); + + if (TargetCpu.starts_with("gfx12")) { + return AMDHSA_BITS_GET(Rsrc3, + hsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE); + } + + log() << "hotswap: error: getKernelDescriptorInstPrefSize: unsupported " + << "target CPU '" << TargetCpu << "' for kernel '" << KernelName + << "'.\n"; + return std::nullopt; +} + // -- ElfView::getKernelVgprCount ---------------------------------------------- std::optional @@ -231,7 +643,14 @@ ElfView::getKernelVgprCount(StringRef KernelName, sizeof(Rsrc1)); uint32_t Granulated = AMDHSA_BITS_GET( Rsrc1, hsa::COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT); - return (Granulated + 1) * VgprGranuleSize; + uint64_t VgprCount = + (static_cast(Granulated) + 1) * VgprGranuleSize; + if (VgprCount > std::numeric_limits::max()) { + log() << "hotswap: error: getKernelVgprCount: descriptor VGPR count for '" + << KernelName << "' exceeds unsigned.\n"; + return std::nullopt; + } + return static_cast(VgprCount); } // Reads the static (compile-time-fixed) LDS allocation from the kernel @@ -268,58 +687,86 @@ ElfView::getKernelStaticLdsSize(StringRef KernelName) const { // preferred source. Falls back to the KD field when no metadata note is // present (e.g. minimal test ELFs assembled with -nostdlib). -static constexpr unsigned SgprEncodingGranule = 8; - std::optional ElfView::getKernelSgprCount(StringRef KernelName) const { // --- Try msgpack metadata note first. --- Expected PhdrsOrErr = File.program_headers(); + bool SawMetadataNote = false; if (PhdrsOrErr) { for (const ELFT::Phdr &Phdr : *PhdrsOrErr) { if (Phdr.p_type != ELF::PT_NOTE) continue; Error Err = Error::success(); - for (const auto &Note : File.notes(Phdr, Err)) { + for (ELFT::Note Note : File.notes(Phdr, Err)) { if (Note.getName() != "AMDGPU" || Note.getType() != ELF::NT_AMDGPU_METADATA) continue; + SawMetadataNote = true; - StringRef Blob = Note.getDescAsStringRef(4); + ArrayRef Desc = Note.getDesc(4); + if (Desc.empty()) { + log() << "hotswap: error: getKernelSgprCount: AMDGPU metadata note " + << "has an empty descriptor.\n"; + return std::nullopt; + } + + StringRef Blob(reinterpret_cast(Desc.data()), + Desc.size()); msgpack::Document Doc; - if (!Doc.readFromBlob(Blob, false)) - continue; + if (!Doc.readFromBlob(Blob, false)) { + log() << "hotswap: error: getKernelSgprCount: failed to parse " + << "AMDGPU metadata note.\n"; + return std::nullopt; + } msgpack::DocNode Root = Doc.getRoot(); - if (!Root.isMap()) - continue; - auto KernelsIt = Root.getMap().find("amdhsa.kernels"); - if (KernelsIt == Root.getMap().end() || !KernelsIt->second.isArray()) + if (!Root.isMap()) { + log() << "hotswap: error: getKernelSgprCount: AMDGPU metadata root " + << "is not a map.\n"; + return std::nullopt; + } + msgpack::MapDocNode &RootMap = Root.getMap(); + msgpack::DocNode::MapTy::iterator KernelsIt = + RootMap.find("amdhsa.kernels"); + if (KernelsIt == RootMap.end() || !KernelsIt->second.isArray()) continue; - for (auto &KNode : KernelsIt->second.getArray()) { + msgpack::ArrayDocNode &KernelArray = KernelsIt->second.getArray(); + for (msgpack::DocNode &KNode : KernelArray) { if (!KNode.isMap()) continue; - auto &KMap = KNode.getMap(); - auto NameIt = KMap.find(".name"); + msgpack::MapDocNode &KMap = KNode.getMap(); + msgpack::DocNode::MapTy::iterator NameIt = KMap.find(".name"); if (NameIt == KMap.end() || !NameIt->second.isString() || NameIt->second.getString() != KernelName) continue; - auto SgprIt = KMap.find(".sgpr_count"); - if (SgprIt == KMap.end()) - break; - if (SgprIt->second.getKind() == msgpack::Type::UInt) - return static_cast(SgprIt->second.getUInt()); - if (SgprIt->second.getKind() == msgpack::Type::Int) - return static_cast(SgprIt->second.getInt()); - break; + msgpack::DocNode::MapTy::iterator SgprIt = KMap.find(".sgpr_count"); + if (SgprIt == KMap.end()) { + log() << "hotswap: error: getKernelSgprCount: metadata for kernel '" + << KernelName << "' has no .sgpr_count.\n"; + return std::nullopt; + } + return readSgprCountMetadataNode(SgprIt->second, KernelName, + "getKernelSgprCount"); } } - if (errorToBool(std::move(Err))) - break; + if (Err) { + log() << "hotswap: error: getKernelSgprCount: failed to iterate " + << "AMDGPU notes: " << toString(std::move(Err)) << "\n"; + return std::nullopt; + } } } else { - consumeError(PhdrsOrErr.takeError()); + log() << "hotswap: error: getKernelSgprCount: failed to read program " + << "headers: " << toString(PhdrsOrErr.takeError()) << "\n"; + return std::nullopt; + } + + if (SawMetadataNote) { + log() << "hotswap: error: getKernelSgprCount: AMDGPU metadata has no " + << ".sgpr_count entry for kernel '" << KernelName << "'.\n"; + return std::nullopt; } // --- Fallback: read the KD field. --- @@ -336,7 +783,14 @@ ElfView::getKernelSgprCount(StringRef KernelName) const { sizeof(Rsrc1)); uint32_t Granulated = AMDHSA_BITS_GET( Rsrc1, hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT); - return (Granulated + 1) * SgprEncodingGranule; + uint64_t SgprCount = + (static_cast(Granulated) + 1) * SgprEncodingGranule; + if (SgprCount > std::numeric_limits::max()) { + log() << "hotswap: error: getKernelSgprCount: descriptor SGPR count for '" + << KernelName << "' exceeds unsigned.\n"; + return std::nullopt; + } + return static_cast(SgprCount); } // -- ElfView::updateKernelDescriptor ------------------------------------------ @@ -364,22 +818,28 @@ void ElfView::updateKernelDescriptor(StringRef KernelName, unsigned ExtraVgprs, uint32_t MaxGran = static_cast( hsa::COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT >> hsa::COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT); - unsigned Extra = (ExtraVgprs + VgprGranuleSize - 1) / VgprGranuleSize; + uint64_t Extra = (static_cast(ExtraVgprs) + VgprGranuleSize - 1) / + VgprGranuleSize; + uint64_t NewGranulated = + std::min(static_cast(Current) + Extra, MaxGran); AMDHSA_BITS_SET(Rsrc1, hsa::COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, - std::min(Current + Extra, MaxGran)); + static_cast(NewGranulated)); std::memcpy(Kd + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc1), &Rsrc1, sizeof(Rsrc1)); } // -- Section/program header adjustment for trampoline growth ------------------ -static void adjustSectionHeaders(uint8_t *Elf, size_t ElfSize, +static bool adjustSectionHeaders(uint8_t *Elf, size_t ElfSize, uint64_t TextOffset, uint64_t TextSize, size_t TrampTotal) { if (ElfSize < sizeof(Ehdr)) - return; + return true; - uint64_t TextEnd = TextOffset + TextSize; + std::optional TextEnd = + checkedAdd(TextOffset, TextSize, "section header .text end"); + if (!TextEnd) + return false; uint64_t Shoff; uint16_t Shentsize; uint16_t Shnum; @@ -387,49 +847,74 @@ static void adjustSectionHeaders(uint8_t *Elf, size_t ElfSize, std::memcpy(&Shentsize, Elf + offsetof(Ehdr, e_shentsize), sizeof(Shentsize)); std::memcpy(&Shnum, Elf + offsetof(Ehdr, e_shnum), sizeof(Shnum)); if (Shentsize < sizeof(Shdr)) - return; + return true; - if (Shoff >= TextEnd) { - uint64_t NewShoff = Shoff + TrampTotal; - std::memcpy(Elf + offsetof(Ehdr, e_shoff), &NewShoff, sizeof(NewShoff)); - Shoff = NewShoff; + if (Shoff >= *TextEnd) { + std::optional NewShoff = + checkedAdd(Shoff, TrampTotal, "section header table offset"); + if (!NewShoff) + return false; + uint64_t NewShoffValue = *NewShoff; + std::memcpy(Elf + offsetof(Ehdr, e_shoff), &NewShoffValue, + sizeof(NewShoffValue)); + Shoff = NewShoffValue; } for (uint16_t I = 0; I < Shnum; ++I) { - uint64_t ShPos = Shoff + static_cast(I) * Shentsize; - if (ShPos + sizeof(Shdr) > ElfSize) + uint64_t ShTableDelta = static_cast(I) * Shentsize; + std::optional ShPos = + checkedAdd(Shoff, ShTableDelta, "section header entry offset"); + if (!ShPos) + return false; + if (*ShPos > ElfSize || sizeof(Shdr) > ElfSize - *ShPos) break; - uint8_t *Sh = Elf + ShPos; + uint8_t *Sh = Elf + *ShPos; uint64_t ShOffset; std::memcpy(&ShOffset, Sh + offsetof(Shdr, sh_offset), sizeof(ShOffset)); if (ShOffset == TextOffset) { - uint64_t NewTextSize = TextSize + TrampTotal; - std::memcpy(Sh + offsetof(Shdr, sh_size), &NewTextSize, - sizeof(NewTextSize)); + std::optional NewTextSize = + checkedAdd(TextSize, TrampTotal, ".text section size"); + if (!NewTextSize) + return false; + uint64_t NewTextSizeValue = *NewTextSize; + std::memcpy(Sh + offsetof(Shdr, sh_size), &NewTextSizeValue, + sizeof(NewTextSizeValue)); } else if (ShOffset > TextOffset) { - uint64_t NewOffset = ShOffset + TrampTotal; - std::memcpy(Sh + offsetof(Shdr, sh_offset), &NewOffset, - sizeof(NewOffset)); + std::optional NewOffset = + checkedAdd(ShOffset, TrampTotal, "post-.text section offset"); + if (!NewOffset) + return false; + uint64_t NewOffsetValue = *NewOffset; + std::memcpy(Sh + offsetof(Shdr, sh_offset), &NewOffsetValue, + sizeof(NewOffsetValue)); uint64_t ShFlags; std::memcpy(&ShFlags, Sh + offsetof(Shdr, sh_flags), sizeof(ShFlags)); if (ShFlags & ELF::SHF_ALLOC) { uint64_t ShAddr; std::memcpy(&ShAddr, Sh + offsetof(Shdr, sh_addr), sizeof(ShAddr)); - ShAddr += TrampTotal; + std::optional NewAddr = + checkedAdd(ShAddr, TrampTotal, "post-.text section address"); + if (!NewAddr) + return false; + ShAddr = *NewAddr; std::memcpy(Sh + offsetof(Shdr, sh_addr), &ShAddr, sizeof(ShAddr)); } } } + return true; } -static void adjustProgramHeaders(uint8_t *Elf, size_t ElfSize, +static bool adjustProgramHeaders(uint8_t *Elf, size_t ElfSize, uint64_t TextOffset, uint64_t TextSize, size_t TrampTotal) { if (ElfSize < sizeof(Ehdr)) - return; + return true; - uint64_t TextEnd = TextOffset + TextSize; + std::optional TextEnd = + checkedAdd(TextOffset, TextSize, "program header .text end"); + if (!TextEnd) + return false; uint64_t Phoff; uint16_t Phentsize; uint16_t Phnum; @@ -437,13 +922,17 @@ static void adjustProgramHeaders(uint8_t *Elf, size_t ElfSize, std::memcpy(&Phentsize, Elf + offsetof(Ehdr, e_phentsize), sizeof(Phentsize)); std::memcpy(&Phnum, Elf + offsetof(Ehdr, e_phnum), sizeof(Phnum)); if (Phentsize < sizeof(Phdr)) - return; + return true; for (uint16_t I = 0; I < Phnum; ++I) { - uint64_t PhPos = Phoff + static_cast(I) * Phentsize; - if (PhPos + sizeof(Phdr) > ElfSize) + uint64_t PhTableDelta = static_cast(I) * Phentsize; + std::optional PhPos = + checkedAdd(Phoff, PhTableDelta, "program header entry offset"); + if (!PhPos) + return false; + if (*PhPos > ElfSize || sizeof(Phdr) > ElfSize - *PhPos) break; - uint8_t *Ph = Elf + PhPos; + uint8_t *Ph = Elf + *PhPos; uint64_t POffset; uint64_t PFilesz; uint64_t PMemsz; @@ -451,24 +940,126 @@ static void adjustProgramHeaders(uint8_t *Elf, size_t ElfSize, std::memcpy(&PFilesz, Ph + offsetof(Phdr, p_filesz), sizeof(PFilesz)); std::memcpy(&PMemsz, Ph + offsetof(Phdr, p_memsz), sizeof(PMemsz)); - if (POffset <= TextOffset && POffset + PFilesz >= TextEnd) { - PFilesz += TrampTotal; - PMemsz += TrampTotal; + std::optional PEnd = + checkedAdd(POffset, PFilesz, "program header file end"); + if (!PEnd) + return false; + if (POffset <= TextOffset && *PEnd >= *TextEnd) { + std::optional NewPFilesz = + checkedAdd(PFilesz, TrampTotal, "program header file size"); + std::optional NewPMemsz = + checkedAdd(PMemsz, TrampTotal, "program header memory size"); + if (!NewPFilesz || !NewPMemsz) + return false; + PFilesz = *NewPFilesz; + PMemsz = *NewPMemsz; std::memcpy(Ph + offsetof(Phdr, p_filesz), &PFilesz, sizeof(PFilesz)); std::memcpy(Ph + offsetof(Phdr, p_memsz), &PMemsz, sizeof(PMemsz)); } else if (POffset > TextOffset) { - POffset += TrampTotal; + std::optional NewPOffset = + checkedAdd(POffset, TrampTotal, "post-.text program offset"); + if (!NewPOffset) + return false; + POffset = *NewPOffset; std::memcpy(Ph + offsetof(Phdr, p_offset), &POffset, sizeof(POffset)); uint64_t PVaddr; std::memcpy(&PVaddr, Ph + offsetof(Phdr, p_vaddr), sizeof(PVaddr)); - PVaddr += TrampTotal; + std::optional NewPVaddr = + checkedAdd(PVaddr, TrampTotal, "post-.text program vaddr"); + if (!NewPVaddr) + return false; + PVaddr = *NewPVaddr; std::memcpy(Ph + offsetof(Phdr, p_vaddr), &PVaddr, sizeof(PVaddr)); uint64_t PPaddr; std::memcpy(&PPaddr, Ph + offsetof(Phdr, p_paddr), sizeof(PPaddr)); - PPaddr += TrampTotal; + std::optional NewPPaddr = + checkedAdd(PPaddr, TrampTotal, "post-.text program paddr"); + if (!NewPPaddr) + return false; + PPaddr = *NewPPaddr; std::memcpy(Ph + offsetof(Phdr, p_paddr), &PPaddr, sizeof(PPaddr)); } } + return true; +} + +static bool adjustSymbolValues(uint8_t *Elf, size_t ElfSize, + uint64_t TextOffset, size_t TrampTotal) { + if (TrampTotal == 0) + return true; + + Expected FileOrErr = + ELFFileT::create(StringRef(reinterpret_cast(Elf), ElfSize)); + if (!FileOrErr) { + log() << "hotswap: error: adjustSymbolValues: failed to parse grown ELF: " + << toString(FileOrErr.takeError()) << "\n"; + return false; + } + ELFFileT File = std::move(*FileOrErr); + + if (File.getHeader().e_type == ELF::ET_REL) + return true; + + Expected SectionsOrErr = File.sections(); + if (!SectionsOrErr) { + log() << "hotswap: error: adjustSymbolValues: failed to read section " + << "headers: " << toString(SectionsOrErr.takeError()) << "\n"; + return false; + } + ELFT::ShdrRange Sections = *SectionsOrErr; + + unsigned SectionIndex = 0; + for (const ELFT::Shdr &SymShdr : Sections) { + if (SymShdr.sh_type != ELF::SHT_SYMTAB && + SymShdr.sh_type != ELF::SHT_DYNSYM) { + ++SectionIndex; + continue; + } + + Expected SymsOrErr = File.symbols(&SymShdr); + if (!SymsOrErr) { + log() << "hotswap: error: adjustSymbolValues: failed to read symbol " + << "table section " << SectionIndex << ": " + << toString(SymsOrErr.takeError()) << "\n"; + ++SectionIndex; + continue; + } + + for (const ELFT::Sym &Sym : *SymsOrErr) { + if (Sym.st_shndx == ELF::SHN_UNDEF || Sym.st_shndx >= ELF::SHN_LORESERVE) + continue; + + Expected DefShdrOrErr = File.getSection(Sym.st_shndx); + if (!DefShdrOrErr) { + log() << "hotswap: error: adjustSymbolValues: symbol references " + << "missing section " << Sym.st_shndx << ": " + << toString(DefShdrOrErr.takeError()) << "\n"; + continue; + } + const ELFT::Shdr &DefShdr = **DefShdrOrErr; + if (!(DefShdr.sh_flags & ELF::SHF_ALLOC) || + DefShdr.sh_offset <= TextOffset) + continue; + + const uint8_t *SymBytes = reinterpret_cast(&Sym); + if (SymBytes < File.base() || SymBytes + sizeof(ELFT::Sym) > File.end()) { + log() << "hotswap: error: adjustSymbolValues: symbol table entry is " + << "outside the ELF buffer.\n"; + continue; + } + + uint64_t SymOffset = SymBytes - File.base(); + std::optional Value = + checkedAdd(Sym.st_value, TrampTotal, "post-.text symbol value"); + if (!Value) + return false; + uint64_t Value64 = *Value; + std::memcpy(Elf + SymOffset + offsetof(ELFT::Sym, st_value), &Value64, + sizeof(Value64)); + } + ++SectionIndex; + } + return true; } // -- ElfView::growWithTrampolines --------------------------------------------- @@ -480,21 +1071,33 @@ ElfView::growWithTrampolines(ArrayRef Trampolines, const uint8_t *Input = data(); size_t TrampTotal = 0; - for (const Trampoline &T : Trampolines) + for (const Trampoline &T : Trampolines) { + if (T.Bytes.size() > std::numeric_limits::max() - TrampTotal) { + log() << "hotswap: error: growWithTrampolines: trampoline byte count " + << "overflows size_t.\n"; + return nullptr; + } TrampTotal += T.Bytes.size(); + } if (TrampTotal == 0) { log() << "hotswap: growWithTrampolines: no trampolines to insert; " << "returning empty result.\n"; return nullptr; } - if (TrampTotal > SIZE_MAX - InputSize) { + if (TrampTotal > std::numeric_limits::max() - InputSize) { log() << "hotswap: error: growWithTrampolines: trampoline bytes (" << TrampTotal << ") + existing ELF size (" << InputSize << ") overflow size_t.\n"; return nullptr; } - uint64_t TextEnd = textOffset() + textSize(); + std::optional TextEnd = + checkedAdd(textOffset(), textSize(), "growWithTrampolines .text end"); + if (!TextEnd || *TextEnd > InputSize) { + log() << "hotswap: error: growWithTrampolines: .text range exceeds input " + << "ELF size.\n"; + return nullptr; + } // Pad TrampTotal to the maximum alignment of all post-.text sections so // that shifting file offsets preserves sh_addralign invariants. The @@ -506,8 +1109,12 @@ ElfView::growWithTrampolines(ArrayRef Trampolines, if (Shdr.sh_addralign > MaxPostTextAlign) MaxPostTextAlign = Shdr.sh_addralign; } - size_t PaddedTrampTotal = llvm::alignTo(TrampTotal, MaxPostTextAlign); - if (PaddedTrampTotal > SIZE_MAX - InputSize) { + std::optional PaddedTrampTotalOrErr = checkedAlignToSize( + TrampTotal, MaxPostTextAlign, "padded trampoline byte count"); + if (!PaddedTrampTotalOrErr) + return nullptr; + size_t PaddedTrampTotal = *PaddedTrampTotalOrErr; + if (PaddedTrampTotal > std::numeric_limits::max() - InputSize) { log() << "hotswap: error: growWithTrampolines: padded trampoline bytes (" << PaddedTrampTotal << ") + ELF size (" << InputSize << ") overflow size_t.\n"; @@ -526,8 +1133,9 @@ ElfView::growWithTrampolines(ArrayRef Trampolines, } uint8_t *Out = reinterpret_cast(Buf->getBufferStart()); - std::memcpy(Out, Input, TextEnd); - uint64_t Pos = TextEnd; + size_t TextEndSize = static_cast(*TextEnd); + std::memcpy(Out, Input, TextEndSize); + size_t Pos = TextEndSize; for (const Trampoline &T : Trampolines) { std::memcpy(Out + Pos, T.Bytes.data(), T.Bytes.size()); Pos += T.Bytes.size(); @@ -540,13 +1148,15 @@ ElfView::growWithTrampolines(ArrayRef Trampolines, std::memset(Out + Pos, 0, PadBytes); Pos += PadBytes; } - if (TextEnd < InputSize) - std::memcpy(Out + Pos, Input + TextEnd, InputSize - TextEnd); + if (TextEndSize < InputSize) + std::memcpy(Out + Pos, Input + TextEndSize, InputSize - TextEndSize); - adjustSectionHeaders(Out, NewSize, textOffset(), textSize(), - PaddedTrampTotal); - adjustProgramHeaders(Out, NewSize, textOffset(), textSize(), - PaddedTrampTotal); + if (!adjustSectionHeaders(Out, NewSize, textOffset(), textSize(), + PaddedTrampTotal) || + !adjustProgramHeaders(Out, NewSize, textOffset(), textSize(), + PaddedTrampTotal) || + !adjustSymbolValues(Out, NewSize, textOffset(), PaddedTrampTotal)) + return nullptr; log() << "hotswap: growWithTrampolines: grew ELF from " << InputSize << " to " << NewSize << " bytes (" << Trampolines.size() << " trampoline" << (Trampolines.size() == 1 ? "" : "s") << ", " << TrampTotal diff --git a/amd/comgr/src/comgr-hotswap-entry-trampoline.cpp b/amd/comgr/src/comgr-hotswap-entry-trampoline.cpp new file mode 100644 index 0000000000000..b8497fea7665e --- /dev/null +++ b/amd/comgr/src/comgr-hotswap-entry-trampoline.cpp @@ -0,0 +1,597 @@ +//===- comgr-hotswap-entry-trampoline.cpp - Kernel-entry stubs ------------===// +// +// Part of Comgr, under the Apache License v2.0 with LLVM Exceptions. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Kernel-entry redirection pass for COMGR HotSwap. This pass is +/// independent of the gfx1250 B0-to-A0 instruction patcher: it appends one +/// PC-relative entry stub per kernel descriptor and rewrites the descriptor's +/// kernel_code_entry_byte_offset to point at that stub. +/// +//===----------------------------------------------------------------------===// + +#include "comgr-hotswap-internal.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/CheckedArithmetic.h" + +#include +#include + +using namespace llvm; + +namespace COMGR { +namespace hotswap { + +static bool appendAsm(SmallVectorImpl &Out, StringRef Asm, + const LLVMState &LS) { + SmallVector Bytes = assembleSingleInst(Asm, LS); + if (Bytes.empty()) { + log() << "hotswap: error: failed to assemble entry-stub instruction: " + << Asm << "\n"; + return false; + } + Out.append(Bytes.begin(), Bytes.end()); + return true; +} + +static SmallVector getCodeEndBytes(const LLVMState &LS) { + SmallVector CodeEnd = assembleSingleInst("s_code_end", LS); + if (CodeEnd.empty()) + log() << "hotswap: error: failed to assemble s_code_end for entry-stub " + << "padding.\n"; + return CodeEnd; +} + +static std::optional checkedAdd(uint64_t LHS, uint64_t RHS, + StringRef Context) { + std::optional Result = checkedAddUnsigned(LHS, RHS); + if (Result) + return Result; + + log() << "hotswap: error: " << Context << " overflows uint64_t.\n"; + return std::nullopt; +} + +SmallVector buildKernelEntryTrampoline(uint64_t StubVAddr, + uint64_t EntryVAddr, + unsigned ScratchSgpr, + const LLVMState &LS) { + if (ScratchSgpr == std::numeric_limits::max()) { + log() << "hotswap: error: kernel-entry stub scratch SGPR pair overflows " + << "unsigned.\n"; + return {}; + } + + SmallVector Bytes; + std::string ScratchPair = + (Twine("s[") + Twine(ScratchSgpr) + ":" + Twine(ScratchSgpr + 1) + "]") + .str(); + std::string ScratchLo = (Twine("s") + Twine(ScratchSgpr)).str(); + std::string ScratchHi = (Twine("s") + Twine(ScratchSgpr + 1)).str(); + + // Assemble through the MC layer instead of spelling encoded bytes; the LIT + // test pins the generated stub's disassembly. + if (!appendAsm(Bytes, "global_wb", LS)) + return {}; + if (!appendAsm(Bytes, "v_nop", LS)) + return {}; + if (!appendAsm(Bytes, "s_get_pc_i64 " + ScratchPair, LS)) + return {}; + + // s_get_pc_i64 returns the address of the following s_add_u32 instruction. + // Materialize the original entry with a 64-bit PC-relative add so the code + // object can be rewritten before ROCR knows final device addresses. + std::optional PcBase = + checkedAdd(StubVAddr, static_cast(Bytes.size()), + "kernel-entry stub PC base"); + if (!PcBase) + return {}; + // Unsigned subtraction is intentional: the immediate pair materializes the + // 64-bit two's-complement delta, including backward jumps. + const uint64_t Delta = EntryVAddr - *PcBase; + const uint32_t Lo = static_cast(Delta); + const uint32_t Hi = static_cast(Delta >> 32); + + if (!appendAsm(Bytes, + "s_add_u32 " + ScratchLo + ", " + ScratchLo + ", 0x" + + utohexstr(Lo), + LS)) + return {}; + if (!appendAsm(Bytes, + "s_addc_u32 " + ScratchHi + ", " + ScratchHi + ", 0x" + + utohexstr(Hi), + LS)) + return {}; + if (!appendAsm(Bytes, "s_set_pc_i64 " + ScratchPair, LS)) + return {}; + + SmallVector CodeEnd = getCodeEndBytes(LS); + if (CodeEnd.empty()) + return {}; + if (Bytes.size() > KernelEntryStubStride) { + log() << "hotswap: error: kernel-entry stub grew past " + << KernelEntryStubStride << " bytes.\n"; + return {}; + } + while (Bytes.size() < KernelEntryStubStride) { + if (Bytes.size() + CodeEnd.size() > KernelEntryStubStride) { + log() << "hotswap: error: s_code_end padding does not evenly fill " + << "kernel-entry stub stride " << KernelEntryStubStride << ".\n"; + return {}; + } + Bytes.append(CodeEnd.begin(), CodeEnd.end()); + } + return Bytes; +} + +uint64_t computeKernelEntryPrefetchGuardBytes(uint32_t InstPrefLines) { + const uint64_t PrefetchBytes = + static_cast(InstPrefLines) * KernelEntryInstPrefUnitBytes; + if (PrefetchBytes <= KernelEntryStubStride) + return 0; + return PrefetchBytes - KernelEntryStubStride; +} + +static bool hasResolvedEntryStubState(const LLVMState &LS, StringRef Context) { + if (!LS.MCII || LS.GlobalWbOpcode >= LS.MCII->getNumOpcodes() || + LS.SGetPcI64Opcode >= LS.MCII->getNumOpcodes() || + LS.SAddU32Opcode >= LS.MCII->getNumOpcodes() || + LS.SAddcU32Opcode >= LS.MCII->getNumOpcodes() || + LS.SSetPcI64Opcode >= LS.MCII->getNumOpcodes()) { + log() << "hotswap: error: " << Context + << ": LLVMState lacks resolved entry-stub opcodes.\n"; + return false; + } + + if (!LS.MRI) { + log() << "hotswap: error: " << Context + << ": LLVMState lacks register info.\n"; + return false; + } + + return true; +} + +static bool decodeKernelEntryStub(ArrayRef Bytes, const LLVMState &LS, + std::vector &Decoded, + StringRef Context) { + if (Bytes.size() < KernelEntryStubStride) + return false; + + if (!hasResolvedEntryStubState(LS, Context)) + return false; + + if (!decodeTextSection(Bytes.data(), KernelEntryStubStride, LS, Decoded)) { + log() << "hotswap: error: " << Context << ": failed to decode " + << KernelEntryStubStride << "-byte candidate.\n"; + return false; + } + return Decoded.size() >= 6; +} + +static bool hasRegOperand(const MCInst &Inst, unsigned Index) { + return Inst.getNumOperands() > Index && Inst.getOperand(Index).isReg(); +} + +static bool hasImmOperand(const MCInst &Inst, unsigned Index) { + return Inst.getNumOperands() > Index && Inst.getOperand(Index).isImm(); +} + +static bool sameRegOperand(const MCInst &LHS, unsigned LHSIndex, + const MCInst &RHS, unsigned RHSIndex) { + return hasRegOperand(LHS, LHSIndex) && hasRegOperand(RHS, RHSIndex) && + LHS.getOperand(LHSIndex).getReg() == RHS.getOperand(RHSIndex).getReg(); +} + +static bool hasEntryStubOperandShape(ArrayRef Decoded, + const LLVMState &LS) { + if (Decoded.size() < 6) + return false; + + if (Decoded[0].Inst.getOpcode() != LS.GlobalWbOpcode || + Decoded[1].Inst.getOpcode() != LS.VNopInst.getOpcode() || + Decoded[2].Inst.getOpcode() != LS.SGetPcI64Opcode || + Decoded[3].Inst.getOpcode() != LS.SAddU32Opcode || + Decoded[4].Inst.getOpcode() != LS.SAddcU32Opcode || + Decoded[5].Inst.getOpcode() != LS.SSetPcI64Opcode) + return false; + + const MCInst &GlobalWb = Decoded[0].Inst; + const MCInst &VNop = Decoded[1].Inst; + const MCInst &GetPc = Decoded[2].Inst; + const MCInst &AddLo = Decoded[3].Inst; + const MCInst &AddHi = Decoded[4].Inst; + const MCInst &SetPc = Decoded[5].Inst; + + if (GlobalWb.getNumOperands() != 1 || !GlobalWb.getOperand(0).isImm() || + GlobalWb.getOperand(0).getImm() != 0 || VNop.getNumOperands() != 0) + return false; + + if (GetPc.getNumOperands() != 1 || SetPc.getNumOperands() != 1 || + !sameRegOperand(GetPc, 0, SetPc, 0)) + return false; + + if (AddLo.getNumOperands() != 3 || AddHi.getNumOperands() != 3 || + !sameRegOperand(AddLo, 0, AddLo, 1) || + !sameRegOperand(AddHi, 0, AddHi, 1) || !hasImmOperand(AddLo, 2) || + !hasImmOperand(AddHi, 2)) + return false; + + MCRegister PairReg = GetPc.getOperand(0).getReg(); + MCRegister LoReg = AddLo.getOperand(0).getReg(); + MCRegister HiReg = AddHi.getOperand(0).getReg(); + unsigned LoSubRegIndex = LS.MRI->getSubRegIndex(PairReg, LoReg); + unsigned HiSubRegIndex = LS.MRI->getSubRegIndex(PairReg, HiReg); + return LoSubRegIndex != 0 && HiSubRegIndex != 0 && + LoSubRegIndex != HiSubRegIndex && LoSubRegIndex < HiSubRegIndex; +} + +static std::optional +decodeEntryStubTargetVAddr(ArrayRef Decoded, + uint64_t StubVAddr) { + std::optional PcBaseOffset = + checkedAdd(Decoded[2].Offset, Decoded[2].Size, + "decoded kernel-entry stub PC-base offset"); + if (!PcBaseOffset) + return std::nullopt; + std::optional PcBase = + checkedAdd(StubVAddr, *PcBaseOffset, "decoded kernel-entry stub PC base"); + if (!PcBase) + return std::nullopt; + + const uint64_t Lo = + static_cast(Decoded[3].Inst.getOperand(2).getImm()); + const uint64_t Hi = + static_cast(Decoded[4].Inst.getOperand(2).getImm()); + const uint64_t Delta = Lo | (Hi << 32); + return *PcBase + Delta; +} + +bool isKernelEntryTrampoline(ArrayRef Bytes, const LLVMState &LS) { + std::vector Decoded; + return decodeKernelEntryStub(Bytes, LS, Decoded, "isKernelEntryTrampoline") && + hasEntryStubOperandShape(Decoded, LS); +} + +static std::optional +checkedAlignTo(uint64_t Value, uint64_t Alignment, StringRef Context) { + if (Alignment == 0) + return Value; + + uint64_t Remainder = Value % Alignment; + if (Remainder == 0) + return Value; + return checkedAdd(Value, Alignment - Remainder, Context); +} + +static std::optional entryVAddr(const KernelDescriptorInfo &KD) { + if (KD.EntryOffset >= 0) + return checkedAdd( + KD.VAddr, static_cast(KD.EntryOffset), + (Twine("kernel entry vaddr for '") + KD.KernelName + "'").str()); + + const uint64_t Magnitude = + KD.EntryOffset == std::numeric_limits::min() + ? static_cast(std::numeric_limits::max()) + 1 + : static_cast(-KD.EntryOffset); + if (KD.VAddr < Magnitude) { + log() << "hotswap: error: kernel entry vaddr for '" << KD.KernelName + << "' underflows uint64_t.\n"; + return std::nullopt; + } + return KD.VAddr - Magnitude; +} + +static std::optional +descriptorAlreadyTargetsEntryStub(const ElfView &Elf, + const KernelDescriptorInfo &KD, + const LLVMState &LS) { + std::optional Entry = entryVAddr(KD); + if (!Entry) + return std::nullopt; + if (*Entry < Elf.textAddr()) + return false; + + std::optional TextEnd = + checkedAdd(Elf.textAddr(), Elf.textSize(), "entry trampoline text end"); + if (!TextEnd) + return std::nullopt; + + const uint64_t TextOffset = *Entry - Elf.textAddr(); + if (TextOffset > Elf.textSize() || + KernelEntryStubStride > Elf.textSize() - TextOffset) + return false; + + std::vector Decoded; + if (!decodeKernelEntryStub( + ArrayRef(Elf.textData() + TextOffset, KernelEntryStubStride), + LS, Decoded, "entry trampoline idempotency matcher")) + return false; + if (!hasEntryStubOperandShape(Decoded, LS)) + return false; + + std::optional Target = decodeEntryStubTargetVAddr(Decoded, *Entry); + if (!Target) + return std::nullopt; + + return *Target >= Elf.textAddr() && *Target < *TextEnd && *Target < *Entry; +} + +static std::optional +totalTrampolineBytes(ArrayRef Trampolines) { + uint64_t Total = 0; + for (const Trampoline &T : Trampolines) { + std::optional NewTotal = + checkedAdd(Total, static_cast(T.Bytes.size()), + "existing trampoline byte count"); + if (!NewTotal) + return std::nullopt; + Total = *NewTotal; + } + return Total; +} + +static std::optional +checkedSignedDifference(uint64_t LHS, uint64_t RHS, StringRef Context) { + if (LHS >= RHS) { + uint64_t Diff = LHS - RHS; + if (Diff > static_cast(std::numeric_limits::max())) { + log() << "hotswap: error: " << Context + << " positive offset is not representable as int64_t.\n"; + return std::nullopt; + } + return static_cast(Diff); + } + + uint64_t Diff = RHS - LHS; + constexpr uint64_t Int64MinMagnitude = + static_cast(std::numeric_limits::max()) + 1; + if (Diff > Int64MinMagnitude) { + log() << "hotswap: error: " << Context + << " negative offset is not representable as int64_t.\n"; + return std::nullopt; + } + if (Diff == Int64MinMagnitude) + return std::numeric_limits::min(); + return -static_cast(Diff); +} + +static std::optional allocateEntryStubScratchSgprs( + const ElfView &Elf, const KernelDescriptorInfo &KD, unsigned MaxSgprs) { + constexpr unsigned ScratchSgprs = 2; + std::optional SgprCount = Elf.getKernelSgprCount(KD.KernelName); + if (!SgprCount) { + log() << "hotswap: error: entry trampoline: failed to read SGPR count for '" + << KD.KernelName << "'.\n"; + return std::nullopt; + } + if (*SgprCount > MaxSgprs) { + log() << "hotswap: error: entry trampoline: kernel '" << KD.KernelName + << "' uses " << *SgprCount << " SGPRs, above max " << MaxSgprs + << ".\n"; + return std::nullopt; + } + + unsigned ScratchBase = (*SgprCount + 1) & ~1u; + if (ScratchBase > MaxSgprs || MaxSgprs - ScratchBase < ScratchSgprs) { + log() << "hotswap: error: entry trampoline: kernel '" << KD.KernelName + << "' uses " << *SgprCount << " SGPRs; no aligned scratch pair fits " + << "below max " << MaxSgprs << ".\n"; + return std::nullopt; + } + return ScratchBase; +} + +static bool appendPaddingTrampoline(std::vector &Out, + uint64_t PadBytes, ArrayRef Fill) { + if (PadBytes == 0) + return true; + if (Fill.empty()) { + log() << "hotswap: error: entry-stub alignment padding requested without " + << "cached s_nop bytes.\n"; + return false; + } + if (PadBytes % Fill.size() != 0) { + log() << "hotswap: error: entry-stub alignment padding size " << PadBytes + << " is not a multiple of cached s_nop size " << Fill.size() << ".\n"; + return false; + } + if (PadBytes > static_cast(std::numeric_limits::max())) { + log() << "hotswap: error: entry-stub alignment padding size " << PadBytes + << " exceeds size_t.\n"; + return false; + } + + Trampoline Pad; + while (static_cast(Pad.Bytes.size()) < PadBytes) + Pad.Bytes.append(Fill.begin(), Fill.end()); + Out.push_back(std::move(Pad)); + return true; +} + +std::optional appendKernelEntryTrampolines( + const ElfView &Elf, const LLVMState &LS, unsigned MaxSgprs, + std::vector &Growth, + std::vector &OutFixups) { + std::vector Descriptors = Elf.kernelDescriptors(); + if (Descriptors.empty()) + return 0; + + std::vector Work; + uint32_t MaxInstPrefLines = 0; + for (const KernelDescriptorInfo &KD : Descriptors) { + std::optional AlreadyHasEntryStub = + descriptorAlreadyTargetsEntryStub(Elf, KD, LS); + if (!AlreadyHasEntryStub) + return std::nullopt; + if (*AlreadyHasEntryStub) + continue; + std::optional InstPrefLines = + Elf.getKernelDescriptorInstPrefSize(KD.KernelName, LS.Cpu); + if (!InstPrefLines) + return std::nullopt; + MaxInstPrefLines = std::max(MaxInstPrefLines, *InstPrefLines); + Work.push_back(KD); + } + if (Work.empty()) + return 0; + + std::optional ExistingGrowthBytes = totalTrampolineBytes(Growth); + if (!ExistingGrowthBytes) + return std::nullopt; + uint64_t AppendOffset = *ExistingGrowthBytes; + std::optional TextEndVAddr = + checkedAdd(Elf.textAddr(), Elf.textSize(), "entry trampoline text end"); + if (!TextEndVAddr) + return std::nullopt; + std::optional StubPoolBaseVAddr = + checkedAdd(*TextEndVAddr, AppendOffset, "entry trampoline stub-pool base"); + if (!StubPoolBaseVAddr) + return std::nullopt; + std::optional AlignedStubPoolBaseVAddr = + checkedAlignTo(*StubPoolBaseVAddr, KernelEntryStubStride, + "entry trampoline aligned stub-pool base"); + if (!AlignedStubPoolBaseVAddr) + return std::nullopt; + const uint64_t StubStart = *AlignedStubPoolBaseVAddr - *TextEndVAddr; + std::vector LocalGrowth; + std::vector LocalFixups; + if (!appendPaddingTrampoline(LocalGrowth, StubStart - AppendOffset, + LS.SNopBytes)) + return std::nullopt; + AppendOffset = StubStart; + + for (const KernelDescriptorInfo &KD : Work) { + std::optional StubTextEnd = checkedAdd( + Elf.textSize(), AppendOffset, + (Twine("entry trampoline append offset for '") + KD.KernelName + "'") + .str()); + if (!StubTextEnd) + return std::nullopt; + std::optional StubVAddr = checkedAdd( + Elf.textAddr(), *StubTextEnd, + (Twine("entry trampoline vaddr for '") + KD.KernelName + "'").str()); + if (!StubVAddr) + return std::nullopt; + std::optional ScratchSgpr = + allocateEntryStubScratchSgprs(Elf, KD, MaxSgprs); + if (!ScratchSgpr) + return std::nullopt; + std::optional Entry = entryVAddr(KD); + if (!Entry) + return std::nullopt; + SmallVector Stub = + buildKernelEntryTrampoline(*StubVAddr, *Entry, *ScratchSgpr, LS); + if (Stub.empty()) { + log() << "hotswap: error: failed to build kernel-entry trampoline for '" + << KD.KernelName << "' at original entry vaddr 0x" + << utohexstr(*Entry) << ".\n"; + return std::nullopt; + } + + Trampoline T; + T.Bytes.assign(Stub.begin(), Stub.end()); + LocalGrowth.push_back(std::move(T)); + LocalFixups.push_back({KD.KernelName, AppendOffset, *ScratchSgpr + 2}); + std::optional NewAppendOffset = checkedAdd( + AppendOffset, KernelEntryStubStride, + (Twine("entry trampoline append offset after '") + KD.KernelName + "'") + .str()); + if (!NewAppendOffset) + return std::nullopt; + AppendOffset = *NewAppendOffset; + } + + const uint64_t GuardBytes = + computeKernelEntryPrefetchGuardBytes(MaxInstPrefLines); + if (GuardBytes != 0) { + SmallVector CodeEnd = getCodeEndBytes(LS); + if (CodeEnd.empty() || + !appendPaddingTrampoline(LocalGrowth, GuardBytes, CodeEnd)) + return std::nullopt; + } + + if (LocalFixups.empty()) + return 0; + + if (LocalFixups.size() > std::numeric_limits::max()) { + log() << "hotswap: error: kernel-entry trampoline count " + << LocalFixups.size() << " exceeds uint32_t.\n"; + return std::nullopt; + } + + for (Trampoline &T : LocalGrowth) + Growth.push_back(std::move(T)); + OutFixups.insert(OutFixups.end(), LocalFixups.begin(), LocalFixups.end()); + + log() << "hotswap: installed " << LocalFixups.size() + << " kernel-entry trampoline" << (LocalFixups.size() == 1 ? "" : "s") + << " with " << GuardBytes << " prefetch guard bytes\n"; + return static_cast(LocalFixups.size()); +} + +bool rewriteKernelEntryDescriptorOffsets( + WritableMemoryBuffer &OutBuf, uint64_t OldTextSize, + ArrayRef Fixups) { + if (Fixups.empty()) + return true; + + uint8_t *Data = reinterpret_cast(OutBuf.getBufferStart()); + Expected ViewOrErr = ElfView::create(Data, OutBuf.getBufferSize()); + if (!ViewOrErr) { + log() << "hotswap: error: failed to reparse grown ELF for entry " + << "descriptor rewrites: " << toString(ViewOrErr.takeError()) << "\n"; + return false; + } + + bool Ok = true; + ElfView &OutElf = *ViewOrErr; + for (const KernelEntryTrampolineFixup &Fixup : Fixups) { + std::optional KdVAddr = + OutElf.getKernelDescriptorVAddr(Fixup.KernelName); + if (!KdVAddr) { + log() << "hotswap: error: missing kernel descriptor for entry " + << "trampoline fixup '" << Fixup.KernelName << "'.\n"; + Ok = false; + continue; + } + std::optional StubTextOffset = checkedAdd( + OldTextSize, Fixup.StubTextOffset, + (Twine("entry trampoline text offset for '") + Fixup.KernelName + "'") + .str()); + if (!StubTextOffset) { + Ok = false; + continue; + } + std::optional StubVAddr = checkedAdd( + OutElf.textAddr(), *StubTextOffset, + (Twine("entry trampoline vaddr for '") + Fixup.KernelName + "'").str()); + if (!StubVAddr) { + Ok = false; + continue; + } + std::optional NewOffset = checkedSignedDifference( + *StubVAddr, *KdVAddr, + (Twine("entry trampoline descriptor offset for '") + Fixup.KernelName + + "'") + .str()); + if (!NewOffset) { + Ok = false; + continue; + } + bool UpdatedEntry = + OutElf.updateKernelDescriptorEntryOffset(Fixup.KernelName, *NewOffset); + bool UpdatedSgprs = OutElf.updateKernelDescriptorSgprCount( + Fixup.KernelName, Fixup.RequiredSgprs); + Ok = UpdatedEntry && UpdatedSgprs && Ok; + } + return Ok; +} + +} // namespace hotswap +} // namespace COMGR diff --git a/amd/comgr/src/comgr-hotswap-internal.h b/amd/comgr/src/comgr-hotswap-internal.h index 98666f42e07a9..fd00d4e08baa2 100644 --- a/amd/comgr/src/comgr-hotswap-internal.h +++ b/amd/comgr/src/comgr-hotswap-internal.h @@ -81,6 +81,17 @@ struct Trampoline { llvm::SmallVector Bytes; }; +// Kernel-entry stubs are appended as normal .text growth. Keep each entry on +// the same 256-byte alignment expected by AMDGPU kernel descriptors. +static constexpr uint64_t KernelEntryStubStride = 256; +static constexpr uint64_t KernelEntryInstPrefUnitBytes = 128; + +struct KernelDescriptorInfo { + std::string KernelName; + uint64_t VAddr = 0; + int64_t EntryOffset = 0; +}; + struct NopSled { uint64_t Start = 0; uint64_t End = 0; @@ -189,6 +200,28 @@ class ElfView { /// or nullptr if not found. uint8_t *findKernelDescriptor(llvm::StringRef KernelName); + /// Enumerate kernel descriptor symbols named ".kd" and read their + /// current kernel_code_entry_byte_offset values. + std::vector kernelDescriptors() const; + + /// Return the virtual address of the kernel descriptor symbol for + /// \p KernelName, or std::nullopt when the descriptor is not present. + std::optional + getKernelDescriptorVAddr(llvm::StringRef KernelName) const; + + /// Rewrite kernel_code_entry_byte_offset for \p KernelName. + bool updateKernelDescriptorEntryOffset(llvm::StringRef KernelName, + int64_t NewEntryOffset); + + /// Ensure the kernel descriptor reserves at least \p RequiredSgprs SGPRs. + bool updateKernelDescriptorSgprCount(llvm::StringRef KernelName, + unsigned RequiredSgprs); + + /// Read COMPUTE_PGM_RSRC3.INST_PREF_SIZE for \p KernelName. + std::optional + getKernelDescriptorInstPrefSize(llvm::StringRef KernelName, + llvm::StringRef TargetCpu) const; + /// Read the VGPR count from the kernel descriptor for \p KernelName. /// Returns std::nullopt if the descriptor is not found. std::optional getKernelVgprCount(llvm::StringRef KernelName, @@ -213,11 +246,12 @@ class ElfView { getKernelStaticLdsSize(llvm::StringRef KernelName) const; /// Read the SGPR count for \p KernelName from the \c amdhsa.kernels - /// msgpack metadata note (\c .sgpr_count key). On GFX10+ the kernel + /// msgpack metadata note (\c .sgpr_count key), falling back to the kernel + /// descriptor when the metadata note is absent. On GFX10+ the kernel /// descriptor's \c GRANULATED_WAVEFRONT_SGPR_COUNT is architecturally - /// reserved, so this is the only reliable source. - /// Returns std::nullopt if the metadata note is missing or the kernel - /// is not found. + /// reserved, so metadata is the only reliable source when present. + /// Returns std::nullopt if the matching metadata is malformed, the kernel is + /// missing from present metadata, or the descriptor fallback is unavailable. std::optional getKernelSgprCount(llvm::StringRef KernelName) const; /// Update the RSRC1 VGPR granule count in the kernel descriptor for @@ -336,6 +370,16 @@ struct LLVMState { /// round-trips. llvm::MCInst VNopInst; + /// MC opcodes for the kernel-entry stub sequence, resolved once at + /// initLLVM() time by parsing representative asm snippets. The idempotency + /// matcher compares decoded opcodes against these cached values instead of + /// matching disassembled mnemonic strings. + unsigned GlobalWbOpcode = 0; + unsigned SGetPcI64Opcode = 0; + unsigned SAddU32Opcode = 0; + unsigned SAddcU32Opcode = 0; + unsigned SSetPcI64Opcode = 0; + bool Valid = false; /// Encode a relative `s_branch` from \p FromOffset to \p ToOffset and @@ -630,20 +674,65 @@ HotswapPatchVTable &getHotswapPatchVTable(); #include "comgr-hotswap-patches.def" #undef HOTSWAP_PATCH -// -- Function declarations (B0-to-A0 policy layer) ---------------------------- +// -- Function declarations (kernel-entry trampoline pass) --------------------- + +struct KernelEntryTrampolineFixup { + std::string KernelName; + uint64_t StubTextOffset = 0; + unsigned RequiredSgprs = 0; +}; + +/// Build a 256-byte, entry-aligned HotSwap kernel-entry stub at +/// \p StubVAddr that jumps to \p EntryVAddr using PC-relative address +/// materialization. Returns an empty vector if MC assembly fails. +llvm::SmallVector buildKernelEntryTrampoline(uint64_t StubVAddr, + uint64_t EntryVAddr, + unsigned ScratchSgpr, + const LLVMState &LS); + +/// Structural matcher for the entry stubs produced by +/// buildKernelEntryTrampoline, used to keep the rewrite idempotent. +bool isKernelEntryTrampoline(llvm::ArrayRef Bytes, + const LLVMState &LS); + +/// Compute the trailing readable guard needed after an appended kernel-entry +/// stub pool so CP instruction prefetches from the last stub cannot run past +/// mapped .text bytes. +uint64_t computeKernelEntryPrefetchGuardBytes(uint32_t InstPrefLines); + +/// Append one entry stub per kernel descriptor that does not already target a +/// HotSwap entry stub. The stubs are appended to \p Growth and descriptor +/// rewrites are recorded in \p OutFixups for application after ELF growth. +std::optional appendKernelEntryTrampolines( + const ElfView &Elf, const LLVMState &LS, unsigned MaxSgprs, + std::vector &Growth, + std::vector &OutFixups); + +/// Apply descriptor entry-offset rewrites recorded by +/// appendKernelEntryTrampolines after the ELF has been grown. +bool rewriteKernelEntryDescriptorOffsets( + llvm::WritableMemoryBuffer &OutBuf, uint64_t OldTextSize, + llvm::ArrayRef Fixups); + +// -- Function declarations (GFX1250 hotswap policy layer) --------------------- + +struct Gfx1250RewriteOptions { + bool RunB0A0Patches = true; + bool RunEntryTrampolines = false; +}; -/// Run the full GFX1250 B0-to-A0 rewrite pipeline on \p ElfData / \p ElfSize. +/// Run the selected GFX1250 hotswap rewrite passes on \p ElfData / \p ElfSize. /// \p TargetIdent is the parsed target ISA (produced upstream by Comgr's -/// parseTargetIdentifier()); it is threaded into the MC init so the subtarget -/// triple and feature flags are preserved rather than being reconstructed -/// from just the processor name. On success \p Out is populated with an owned -/// buffer containing the rewritten code object. The caller can transfer the -/// buffer directly to a comgr DataObject via -/// DataObject::setData(std::unique_ptr). -amd_comgr_status_t -retargetCodeObjectB0A0(const void *ElfData, size_t ElfSize, - const TargetIdentifier &TargetIdent, - std::unique_ptr &Out); +/// parseTargetIdentifier() or the hotswap-local stepping parser); it is +/// threaded into the MC init so the subtarget triple and feature flags are +/// preserved rather than being reconstructed from just the processor name. On +/// success \p Out is populated with an owned buffer containing the rewritten +/// code object. The caller can transfer the buffer directly to a comgr +/// DataObject via DataObject::setData(std::unique_ptr). +amd_comgr_status_t retargetCodeObject(const void *ElfData, size_t ElfSize, + const TargetIdentifier &TargetIdent, + const Gfx1250RewriteOptions &Options, + std::unique_ptr &Out); } // namespace hotswap } // namespace COMGR diff --git a/amd/comgr/src/comgr-hotswap-llvm.cpp b/amd/comgr/src/comgr-hotswap-llvm.cpp index 378a109b3126e..1e948bde87301 100644 --- a/amd/comgr/src/comgr-hotswap-llvm.cpp +++ b/amd/comgr/src/comgr-hotswap-llvm.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" @@ -136,10 +137,8 @@ static SmallVector encodeMCInst(const MCInst &Inst, /// Run the AMDGPU asm parser over \p AsmStr and return the captured MCInsts. /// Used by assembleSingleInst() for the full parse-and-encode path, and by -/// initLLVM() / resolveOpcodeViaParse() to pick subtarget-specific opcodes -/// (e.g. s_branch, s_nop) without hardcoding opcode numbers or doing fragile -/// case-insensitive name matching over `MCInstrInfo::getName` (which returns -/// enum-style names such as `S_BRANCH_gfx12`, not the assembly mnemonic). +/// initLLVM() / resolveOpcodeViaParse() for instructions where parsing the +/// assembly mnemonic is the least fragile way to pick the target opcode. static SmallVector parseAsmToMCInsts(StringRef AsmStr, const LLVMState &S) { S.Ctx->reset(); @@ -196,6 +195,20 @@ static unsigned resolveOpcodeViaParse(StringRef AsmSnippet, return Parsed[0].getOpcode(); } +static bool resolveRequiredOpcodeViaParse(StringRef AsmSnippet, + StringRef AssemblyName, + const LLVMState &S, + unsigned &OutOpcode) { + OutOpcode = resolveOpcodeViaParse(AsmSnippet, S); + if (OutOpcode < S.MCII->getNumOpcodes()) + return true; + + log() << "hotswap: error: initLLVM: failed to resolve '" << AssemblyName + << "' opcode via asm parser for CPU '" << S.Cpu << "' using asm:\n" + << " " << AsmSnippet << "\n"; + return false; +} + // -- LLVM MC target init ------------------------------------------------------ LLVMState initLLVM(const TargetIdentifier &TI) { @@ -312,6 +325,22 @@ LLVMState initLLVM(const TargetIdentifier &TI) { } S.VNopInst = VNopInsts[0]; + if (!resolveRequiredOpcodeViaParse("global_wb", "global_wb", S, + S.GlobalWbOpcode)) + return S; + if (!resolveRequiredOpcodeViaParse("s_get_pc_i64 s[0:1]", "s_get_pc_i64", S, + S.SGetPcI64Opcode)) + return S; + if (!resolveRequiredOpcodeViaParse("s_add_u32 s0, s0, 0", "s_add_u32", S, + S.SAddU32Opcode)) + return S; + if (!resolveRequiredOpcodeViaParse("s_addc_u32 s1, s1, 0", "s_addc_u32", S, + S.SAddcU32Opcode)) + return S; + if (!resolveRequiredOpcodeViaParse("s_set_pc_i64 s[0:1]", "s_set_pc_i64", S, + S.SSetPcI64Opcode)) + return S; + S.Valid = true; return S; } @@ -379,16 +408,17 @@ bool decodeTextSection(const uint8_t *Text, uint64_t TextSize, DI.Mnemonic = UnknownMnemonic.str(); } else { DI.Size = static_cast(InstSize); - // MCInstPrinter::getMnemonic returns a pointer into the tblgen-generated - // AsmStrs table (see AMDGPUGenAsmWriter.inc). Storage is process- - // lifetime static; the trailing whitespace baked into AsmStrs must be - // trimmed. Falls back to MCII->getName for targets that leave it null. + // MCInstPrinter::getMnemonic returns a pointer into the generated AsmStrs + // table. Storage is process-lifetime static; the trailing whitespace + // baked into AsmStrs must be trimmed. If the printer cannot provide an + // assembly mnemonic, leave the instruction unmatchable instead of falling + // back to TableGen opcode names. if (S.MCIP) { std::pair Mnem = S.MCIP->getMnemonic(DI.Inst); DI.Mnemonic = Mnem.first ? StringRef(Mnem.first).rtrim().str() - : S.MCII->getName(DI.Inst.getOpcode()).str(); + : UnknownMnemonic.str(); } else { - DI.Mnemonic = S.MCII->getName(DI.Inst.getOpcode()).str(); + DI.Mnemonic = UnknownMnemonic.str(); } } Pos += DI.Size; diff --git a/amd/comgr/src/comgr-hotswap.cpp b/amd/comgr/src/comgr-hotswap.cpp index 32f68ed2db564..2017f8b92f2d4 100644 --- a/amd/comgr/src/comgr-hotswap.cpp +++ b/amd/comgr/src/comgr-hotswap.cpp @@ -6,41 +6,207 @@ //===----------------------------------------------------------------------===// #include "amd_comgr.h" +#include "comgr-env.h" #include "comgr-hotswap-internal.h" #include "comgr.h" +#include "llvm/TargetParser/AMDGPUTargetParser.h" + +#include +#include + using namespace COMGR; +namespace { + +constexpr llvm::StringLiteral Gfx1250B0Feature = "gfx1250-b0-specific"; +constexpr llvm::StringLiteral Gfx1250B0FeatureOn = "gfx1250-b0-specific+"; +constexpr llvm::StringLiteral Gfx1250B0FeatureOff = "gfx1250-b0-specific-"; + +struct ParsedHotswapIsa { + TargetIdentifier Ident; + std::string CanonicalIsa; + std::optional IsB0; +}; + +static bool parseGfx1250B0Feature(llvm::StringRef Feature, + std::optional &IsB0) { + if (Feature == Gfx1250B0FeatureOn) { + IsB0 = true; + return true; + } + if (Feature == Gfx1250B0FeatureOff) { + IsB0 = false; + return true; + } + return false; +} + +static bool isGfx12_5Processor(llvm::StringRef Processor) { + llvm::AMDGPU::IsaVersion Version = llvm::AMDGPU::getIsaVersion(Processor); + return Version.Major == 12 && Version.Minor == 5; +} + +static amd_comgr_status_t parseHotswapIsaName(const char *IsaName, + ParsedHotswapIsa &Parsed) { + Parsed = ParsedHotswapIsa{}; + + llvm::SmallVector Parts; + llvm::StringRef OriginalIsa(IsaName); + if (OriginalIsa.empty()) { + hotswap::log() << "hotswap: error: parseHotswapIsaName: empty ISA name\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + OriginalIsa.split(Parts, ':'); + if (Parts.empty()) { + hotswap::log() << "hotswap: error: parseHotswapIsaName: empty ISA name\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + llvm::SmallVector CanonicalParts; + for (llvm::StringRef Part : Parts) { + std::optional IsB0; + if (parseGfx1250B0Feature(Part, IsB0)) { + if (Parsed.IsB0) { + hotswap::log() << "hotswap: error: parseHotswapIsaName: duplicate " + << Gfx1250B0Feature << " feature in '" << OriginalIsa + << "'\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + Parsed.IsB0 = IsB0; + continue; + } + CanonicalParts.push_back(Part); + } + + if (CanonicalParts.empty() || CanonicalParts[0].empty()) { + hotswap::log() + << "hotswap: error: parseHotswapIsaName: missing canonical ISA in '" + << OriginalIsa << "'\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + Parsed.CanonicalIsa = CanonicalParts[0].str(); + for (size_t I = 1; I < CanonicalParts.size(); ++I) { + Parsed.CanonicalIsa += ":"; + Parsed.CanonicalIsa += CanonicalParts[I].str(); + } + + if (parseTargetIdentifier(Parsed.CanonicalIsa, Parsed.Ident)) { + hotswap::log() + << "hotswap: error: parseHotswapIsaName: failed to parse ISA '" + << Parsed.CanonicalIsa << "' from '" << OriginalIsa << "'\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (Parsed.IsB0 && Parsed.Ident.Processor != "gfx1250") { + hotswap::log() << "hotswap: error: parseHotswapIsaName: " + << Gfx1250B0Feature << " is only valid for gfx1250, not '" + << Parsed.Ident.Processor << "'\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (Parsed.IsB0) + Parsed.Ident.Features.push_back(*Parsed.IsB0 ? Gfx1250B0FeatureOn + : Gfx1250B0FeatureOff); + + return AMD_COMGR_STATUS_SUCCESS; +} + +static bool shouldRunB0A0Patches(const ParsedHotswapIsa &Source, + const ParsedHotswapIsa &Target) { + // Legacy callers only pass gfx1250 today; preserve the existing B0-to-A0 + // rewrite behavior by defaulting an unspecified source to B0 and an + // unspecified target to A0. If either side explicitly names a stepping, honor + // that side instead of forcing the legacy path. + const bool SourceIsB0 = Source.IsB0.value_or(true); + const bool TargetIsB0 = Target.IsB0.value_or(false); + return SourceIsB0 && !TargetIsB0; +} + +} // namespace + amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite( amd_comgr_data_t input, const char *source_isa_name, const char *target_isa_name, amd_comgr_data_t *output) { DataObject *InputP = DataObject::convert(input); - if (!InputP || !InputP->Data || - InputP->DataKind != AMD_COMGR_DATA_KIND_EXECUTABLE || !source_isa_name || - !target_isa_name || !output) + if (!InputP) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: invalid input data " + "handle\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + if (!InputP->Data) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: input data is null\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + if (InputP->DataKind != AMD_COMGR_DATA_KIND_EXECUTABLE) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: input data kind must " + "be executable\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + if (!source_isa_name || !target_isa_name || !output) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: source ISA, target " + "ISA, and output handle are required\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } - TargetIdentifier SourceIdent, TargetIdent; - if (parseTargetIdentifier(source_isa_name, SourceIdent) || - parseTargetIdentifier(target_isa_name, TargetIdent)) + ParsedHotswapIsa SourceIdent, TargetIdent; + if (parseHotswapIsaName(source_isa_name, SourceIdent) || + parseHotswapIsaName(target_isa_name, TargetIdent)) return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; - if (SourceIdent.Processor != "gfx1250" || TargetIdent.Processor != "gfx1250") + if (!isGfx12_5Processor(SourceIdent.Ident.Processor) || + !isGfx12_5Processor(TargetIdent.Ident.Processor)) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: only gfx125x " + "processors are supported, got source '" + << SourceIdent.Ident.Processor << "' and target '" + << TargetIdent.Ident.Processor << "'\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + if (SourceIdent.Ident.Processor != TargetIdent.Ident.Processor) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: processor retargeting " + "is not supported, got source '" + << SourceIdent.Ident.Processor << "' and target '" + << TargetIdent.Ident.Processor << "'\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + hotswap::Gfx1250RewriteOptions Options; + Options.RunB0A0Patches = SourceIdent.Ident.Processor == "gfx1250" && + shouldRunB0A0Patches(SourceIdent, TargetIdent); + Options.RunEntryTrampolines = env::shouldUseHotswapEntryTrampolines(); std::unique_ptr OutBuffer; - amd_comgr_status_t Status = hotswap::retargetCodeObjectB0A0( - InputP->Data, InputP->Size, TargetIdent, OutBuffer); + amd_comgr_status_t Status = hotswap::retargetCodeObject( + InputP->Data, InputP->Size, TargetIdent.Ident, Options, OutBuffer); if (Status != AMD_COMGR_STATUS_SUCCESS) return Status; - if (!OutBuffer) + if (!OutBuffer) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: rewrite returned no " + "output buffer\n"; return AMD_COMGR_STATUS_ERROR; + } DataObject *OutputP = DataObject::allocate(AMD_COMGR_DATA_KIND_EXECUTABLE); - if (!OutputP) + if (!OutputP) { + hotswap::log() << "hotswap: error: amd_comgr_hotswap_rewrite: output data " + "allocation failed\n"; return AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES; + } if (amd_comgr_status_t SetStatus = OutputP->setData(std::move(OutBuffer))) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite: output setData " + "failed with status " + << SetStatus << "\n"; OutputP->release(); return SetStatus; } diff --git a/amd/comgr/src/hotswap/README.md b/amd/comgr/src/hotswap/README.md index bceb1ee280039..2a2eccec20105 100644 --- a/amd/comgr/src/hotswap/README.md +++ b/amd/comgr/src/hotswap/README.md @@ -6,9 +6,24 @@ target ISA names, then returns a new executable code object with the applicable rewrite applied. The input code object is not modified. This directory contains COMGR's hotswap transpiler scaffolding, the raiser-based -path for heavier cross-ISA transformations. The same-family stepping patches are -implemented in the surrounding COMGR source files and are exposed through -`amd_comgr_hotswap_rewrite`. +path for heavier cross-ISA transformations. The same-family stepping patches and +optional entry trampolines are implemented in the surrounding COMGR source files +and are exposed through `amd_comgr_hotswap_rewrite`. + +## Supported transformations + +| Transformation | Status | +| -------------- | ------ | +| gfx1250 B0 to A0 | Supported | +| gfx125x entry trampolines | Supported, on by default | +| gfx950 | Coming soon | +| gfx942 | Coming soon | + +## Environment variables + +| Variable | Effect | +| -------- | ------ | +| `AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES` | Set to a nonzero value to enable gfx125x kernel descriptor entry redirection through COMGR-generated entry stubs, independent of A0/B0 stepping. Off by default. | ## Transpiler (cross-gen) diff --git a/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c b/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c index 146e8aa46b965..01cb19fad38d3 100644 --- a/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c +++ b/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c @@ -31,7 +31,7 @@ int main(int argc, char *argv[]) { if (argc < 4) fail("usage: hotswap-rewrite " "[--zero-size] [--output ] [--dump ] " - "[--check-idempotent]"); + "[--check-idempotent] [--expect-status ]"); const char *ElfFile = argv[1]; const char *SourceISA = argv[2]; @@ -39,6 +39,7 @@ int main(int argc, char *argv[]) { int ZeroSize = 0; const char *OutputPath = NULL; const char *DumpFile = NULL; + const char *ExpectStatus = NULL; int CheckIdempotent = 0; for (int I = 4; I < argc; ++I) { @@ -50,6 +51,8 @@ int main(int argc, char *argv[]) { DumpFile = argv[++I]; else if (strcmp(argv[I], "--check-idempotent") == 0) CheckIdempotent = 1; + else if (strcmp(argv[I], "--expect-status") == 0 && I + 1 < argc) + ExpectStatus = argv[++I]; else { fprintf(stderr, "error: unknown argument: %s\n", argv[I]); return 1; @@ -69,6 +72,20 @@ int main(int argc, char *argv[]) { amd_comgr_status_t Status = amd_comgr_hotswap_rewrite(InputData, SourceISA, TargetISA, &OutputData); + const char *StatusString; + amd_comgr_(status_string(Status, &StatusString)); + + if (ExpectStatus) { + printf("RESULT: %s\n", StatusString); + if (strcmp(StatusString, ExpectStatus) != 0) + fail("expected status %s, saw %s", ExpectStatus, StatusString); + if (Status == AMD_COMGR_STATUS_SUCCESS) + amd_comgr_(release_data(OutputData)); + amd_comgr_(release_data(InputData)); + free(ElfBuf); + return 0; + } + if (Status == AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) { printf("RESULT: INVALID_ARGUMENT\n"); amd_comgr_(release_data(InputData)); @@ -77,7 +94,7 @@ int main(int argc, char *argv[]) { } if (Status != AMD_COMGR_STATUS_SUCCESS) - fail("unexpected error status %d", (int)Status); + fail("unexpected error status %s", StatusString); size_t OutSize = 0; amd_comgr_(get_data(OutputData, &OutSize, NULL)); diff --git a/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s b/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s new file mode 100644 index 0000000000000..7df45acd8ca9d --- /dev/null +++ b/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s @@ -0,0 +1,81 @@ +// COM: HotSwap redirects kernel descriptors to appended PC-relative entry +// COM: stubs when the entry-trampoline flag is explicitly enabled. + +// RUN: %clang -target amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib %s -o %t.elf + +// RUN: hotswap-rewrite %t.elf \ +// RUN: amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --output %t.default.elf \ +// RUN: | %FileCheck --check-prefix=API %s +// RUN: cmp %t.elf %t.default.elf +// RUN: %llvm-objdump -d %t.default.elf | %FileCheck --check-prefix=NO-TRAMP %s +// NO-TRAMP-LABEL: : +// NO-TRAMP: s_endpgm +// NO-TRAMP-NOT: global_wb + +// RUN: AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES=1 hotswap-rewrite %t.elf \ +// RUN: amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --output %t.out.elf \ +// RUN: | %FileCheck --check-prefix=API %s +// API: RESULT: SUCCESS + +// RUN: %llvm-objdump -d %t.out.elf | %FileCheck --check-prefix=DISASM %s +// RUN: %llvm-readelf --notes %t.out.elf | %FileCheck --check-prefix=METADATA %s + +// DISASM-LABEL: : +// DISASM: s_endpgm +// DISASM: global_wb +// DISASM-NEXT: v_nop +// DISASM-NEXT: s_get_pc_i64 s[8:9] +// DISASM-NEXT: s_add_co_u32 s8 +// DISASM-NEXT: s_add_co_ci_u32 s9 +// DISASM-NEXT: s_set_pc_i64 s[8:9] + +// METADATA: .name: entry_tramp_kernel +// METADATA: .sgpr_count: 10 + +// COM: If the requested entry trampoline cannot allocate an aligned scratch +// COM: SGPR pair, the rewrite fails instead of returning a partial output. +// RUN: sed 's/.sgpr_count: 8/.sgpr_count: 105/' %s > %t.highsgpr.s +// RUN: %clang -target amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib \ +// RUN: %t.highsgpr.s -o %t.highsgpr.elf +// RUN: AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES=1 hotswap-rewrite %t.highsgpr.elf amdgcn-amd-amdhsa--gfx1250 \ +// RUN: amdgcn-amd-amdhsa--gfx1250 --expect-status ERROR \ +// RUN: | %FileCheck --check-prefix=NO-SCRATCH %s +// NO-SCRATCH: RESULT: ERROR + +.amdgcn_target "amdgcn-amd-amdhsa--gfx1250" +.text +.globl entry_tramp_kernel +.p2align 8 +.type entry_tramp_kernel,@function +entry_tramp_kernel: + v_mov_b32_e32 v0, 0 + s_endpgm +.Lentry_tramp_kernel_end: +.size entry_tramp_kernel, .Lentry_tramp_kernel_end-entry_tramp_kernel + +.rodata +.p2align 8 +.amdhsa_kernel entry_tramp_kernel + .amdhsa_next_free_vgpr 1 + .amdhsa_next_free_sgpr 1 + .amdhsa_inst_pref_size 7 +.end_amdhsa_kernel + +.amdgpu_metadata + amdhsa.version: + - 3 + - 0 + amdhsa.kernels: + - .name: entry_tramp_kernel + .symbol: entry_tramp_kernel.kd + .sgpr_count: 8 + .vgpr_count: 1 + .kernarg_segment_size: 0 + .group_segment_fixed_size: 0 + .private_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .wavefront_size: 64 + .max_flat_workgroup_size: 256 +.end_amdgpu_metadata diff --git a/amd/comgr/test-unit/CMakeLists.txt b/amd/comgr/test-unit/CMakeLists.txt index c40fbad107e23..1322543489ce1 100644 --- a/amd/comgr/test-unit/CMakeLists.txt +++ b/amd/comgr/test-unit/CMakeLists.txt @@ -103,6 +103,7 @@ comgr_configure_test_target(HotswapElfTests) add_executable(HotswapMCTests HotswapMCTest.cpp ../src/comgr-hotswap-b0a0.cpp + ../src/comgr-hotswap-entry-trampoline.cpp ../src/comgr-hotswap-elf.cpp ../src/comgr-hotswap-llvm.cpp ../src/comgr-hotswap-patch-f32-to-e5m3.cpp diff --git a/amd/comgr/test-unit/HotswapElfTest.cpp b/amd/comgr/test-unit/HotswapElfTest.cpp index 206dfd3fe429c..0121f25606239 100644 --- a/amd/comgr/test-unit/HotswapElfTest.cpp +++ b/amd/comgr/test-unit/HotswapElfTest.cpp @@ -9,10 +9,31 @@ #include "comgr-hotswap-internal.h" #include "comgr-test-elf-utils.h" #include "gtest/gtest.h" + #include +#include using namespace COMGR::hotswap; +static std::vector makeText(size_t Size = 16) { + return std::vector(Size, 0); +} + +static unsigned readReservedSgprs(const std::vector &Bytes, + uint64_t KernelDescriptorOffset) { + namespace hsa = llvm::amdhsa; + + uint32_t Rsrc1 = 0; + std::memcpy(&Rsrc1, + Bytes.data() + KernelDescriptorOffset + + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc1), + sizeof(Rsrc1)); + return (AMDHSA_BITS_GET( + Rsrc1, hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) + + 1) * + 8; +} + // -- ElfView::create ---------------------------------------------------------- TEST(ElfView, RejectsTruncatedInput) { @@ -30,179 +51,221 @@ TEST(ElfView, RejectsNonElfInput) { } // -- ElfView::getKernelStaticLdsSize ------------------------------------------ -// -// getKernelStaticLdsSize reads group_segment_fixed_size (the *static* LDS -// allocation; dynamic LDS is set by the host at dispatch time and not -// visible in the ELF) from a kernel descriptor symbol ".kd". -// Two unit tests cover the helper: -// * negative path: no .kd symbol -> std::nullopt -// * positive path: hand-crafted ELF with a .kd symbol pointing at an -// embedded kernel descriptor -> the embedded LDS size -// Real gfx1250 code-object coverage is added by the lit tests in #2302. TEST(ElfView, GetKernelStaticLdsSizeReturnsNulloptWhenKdMissing) { - // Build a minimal valid ELF64: header + .text + .shstrtab. ELFFile::create - // succeeds, but no .kd symbol exists, so getKernelStaticLdsSize must take - // the missing-KD branch. - using namespace llvm::ELF; - static constexpr size_t BufSize = 512; - alignas(8) uint8_t Buf[BufSize] = {}; - - static constexpr uint64_t ShOff = sizeof(Elf64_Ehdr); - static constexpr uint64_t StrTabOff = 256; - static constexpr uint64_t TextOff = 320; - static constexpr uint64_t TextSize = 16; - - const char StrTab[] = "\0.text\0.shstrtab\0"; - std::memcpy(Buf + StrTabOff, StrTab, sizeof(StrTab)); - - Elf64_Ehdr Ehdr = comgr_test::makeElf64Ehdr(EM_AMDGPU); - Ehdr.e_ident[EI_OSABI] = ELFOSABI_AMDGPU_HSA; - Ehdr.e_type = ET_REL; - Ehdr.e_version = EV_CURRENT; - Ehdr.e_shoff = ShOff; - Ehdr.e_ehsize = sizeof(Elf64_Ehdr); - Ehdr.e_shentsize = sizeof(Elf64_Shdr); - Ehdr.e_shnum = 3; - Ehdr.e_shstrndx = 2; - std::memcpy(Buf, &Ehdr, sizeof(Ehdr)); - - // Shdr[1] = .text - Elf64_Shdr Sh1{}; - Sh1.sh_name = 1; - Sh1.sh_type = SHT_PROGBITS; - Sh1.sh_flags = SHF_ALLOC | SHF_EXECINSTR; - Sh1.sh_offset = TextOff; - Sh1.sh_size = TextSize; - std::memcpy(Buf + ShOff + 1 * sizeof(Elf64_Shdr), &Sh1, sizeof(Sh1)); - - // Shdr[2] = .shstrtab - Elf64_Shdr Sh2{}; - Sh2.sh_name = 7; - Sh2.sh_type = SHT_STRTAB; - Sh2.sh_offset = StrTabOff; - Sh2.sh_size = sizeof(StrTab); - std::memcpy(Buf + ShOff + 2 * sizeof(Elf64_Shdr), &Sh2, sizeof(Sh2)); - - llvm::Expected ViewOrErr = ElfView::create(Buf, BufSize); + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText()); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); EXPECT_EQ(ViewOrErr->getKernelStaticLdsSize("nonexistent_kernel"), std::nullopt); } TEST(ElfView, GetKernelStaticLdsSizeReadsLdsSizeFromKernelDescriptor) { - // Build a minimal AMDGPU ELF64 with the section topology that - // findKernelDescriptor walks: 6 sections (NULL, .text, .rodata, .strtab, - // .symtab, .shstrtab). The kernel descriptor is embedded at the start of - // .rodata with a known group_segment_fixed_size value, and a symbol named - // "test_kernel.kd" in .symtab points at it. getKernelStaticLdsSize must - // return the embedded static-LDS size unchanged. - using namespace llvm::ELF; - static constexpr size_t BufSize = 1024; - alignas(8) uint8_t Buf[BufSize] = {}; - - // Section file offsets and sizes. Layout choices keep each section - // 8-byte aligned so the ELF parser is happy. - static constexpr uint64_t ShOff = sizeof(Elf64_Ehdr); - static constexpr uint64_t TextOff = 0x1C0; - static constexpr uint64_t TextSize = 16; - static constexpr uint64_t RodataOff = 0x1D0; - static constexpr uint64_t KdSize = 64; - static constexpr uint64_t StrTabOff = 0x210; - static constexpr uint64_t SymTabOff = 0x220; - static constexpr uint64_t ShStrTabOff = 0x250; - static constexpr uint64_t SymCount = 2; static constexpr uint32_t TestLdsSize = 16384; - // Section name string table. Entries: "" .text .rodata .strtab .symtab - // .shstrtab. Offsets pinned in the shdr fields below. - const char ShStrTab[] = "\0.text\0.rodata\0.strtab\0.symtab\0.shstrtab\0"; - std::memcpy(Buf + ShStrTabOff, ShStrTab, sizeof(ShStrTab)); - - // Symbol name string table. Single named symbol "test_kernel.kd" at - // offset 1; offset 0 is the conventional empty name. - const char StrTab[] = "\0test_kernel.kd\0"; - std::memcpy(Buf + StrTabOff, StrTab, sizeof(StrTab)); - - Elf64_Ehdr Ehdr = comgr_test::makeElf64Ehdr(EM_AMDGPU); - Ehdr.e_ident[EI_OSABI] = ELFOSABI_AMDGPU_HSA; - Ehdr.e_type = ET_REL; - Ehdr.e_version = EV_CURRENT; - Ehdr.e_shoff = ShOff; - Ehdr.e_ehsize = sizeof(Elf64_Ehdr); - Ehdr.e_shentsize = sizeof(Elf64_Shdr); - Ehdr.e_shnum = 6; - Ehdr.e_shstrndx = 5; - std::memcpy(Buf, &Ehdr, sizeof(Ehdr)); - - // Section header table. Shdr[0] is the conventional NULL section (left - // as the buffer's zero-init). Each non-null shdr is zero-initialized by - // Elf64_Shdr{} so unspecified fields (sh_addr, sh_info, sh_addralign, - // ...) are explicitly zero. - - // Shdr[1] = .text - Elf64_Shdr Sh1{}; - Sh1.sh_name = 1; - Sh1.sh_type = SHT_PROGBITS; - Sh1.sh_flags = SHF_ALLOC | SHF_EXECINSTR; - Sh1.sh_offset = TextOff; - Sh1.sh_size = TextSize; - std::memcpy(Buf + ShOff + 1 * sizeof(Elf64_Shdr), &Sh1, sizeof(Sh1)); - - // Shdr[2] = .rodata (holds the kernel descriptor) - Elf64_Shdr Sh2{}; - Sh2.sh_name = 7; - Sh2.sh_type = SHT_PROGBITS; - Sh2.sh_flags = SHF_ALLOC; - Sh2.sh_offset = RodataOff; - Sh2.sh_size = KdSize; - std::memcpy(Buf + ShOff + 2 * sizeof(Elf64_Shdr), &Sh2, sizeof(Sh2)); - - // Shdr[3] = .strtab (symbol names) - Elf64_Shdr Sh3{}; - Sh3.sh_name = 15; - Sh3.sh_type = SHT_STRTAB; - Sh3.sh_offset = StrTabOff; - Sh3.sh_size = sizeof(StrTab); - std::memcpy(Buf + ShOff + 3 * sizeof(Elf64_Shdr), &Sh3, sizeof(Sh3)); - - // Shdr[4] = .symtab; sh_link = 3 (.strtab) - Elf64_Shdr Sh4{}; - Sh4.sh_name = 23; - Sh4.sh_type = SHT_SYMTAB; - Sh4.sh_offset = SymTabOff; - Sh4.sh_size = sizeof(Elf64_Sym) * SymCount; - Sh4.sh_link = 3; - Sh4.sh_entsize = sizeof(Elf64_Sym); - std::memcpy(Buf + ShOff + 4 * sizeof(Elf64_Shdr), &Sh4, sizeof(Sh4)); - - // Shdr[5] = .shstrtab (section names) - Elf64_Shdr Sh5{}; - Sh5.sh_name = 31; - Sh5.sh_type = SHT_STRTAB; - Sh5.sh_offset = ShStrTabOff; - Sh5.sh_size = sizeof(ShStrTab); - std::memcpy(Buf + ShOff + 5 * sizeof(Elf64_Shdr), &Sh5, sizeof(Sh5)); - - // Kernel descriptor body: group_segment_fixed_size at offset 0. The rest - // of the 64-byte descriptor stays zero, which is fine for a read-only - // helper that only consumes one field. - std::memcpy(Buf + RodataOff, &TestLdsSize, sizeof(TestLdsSize)); - - // Symbol table. Slot 0 is the conventional null symbol (left as the - // buffer's zero-init). Slot 1 names "test_kernel.kd" at .strtab offset 1 - // and points at the start of .rodata (st_value=0). - Elf64_Sym Sym1{}; - Sym1.st_name = 1; - Sym1.setBindingAndType(STB_GLOBAL, STT_OBJECT); - Sym1.st_shndx = 2; - Sym1.st_size = KdSize; - std::memcpy(Buf + SymTabOff + 1 * sizeof(Elf64_Sym), &Sym1, sizeof(Sym1)); - - llvm::Expected ViewOrErr = ElfView::create(Buf, BufSize); + comgr_test::KernelDescriptorElfOptions Opts; + Opts.ElfType = llvm::ELF::ET_REL; + Opts.KernelName = "test_kernel"; + Opts.TextAddr = 0; + Opts.RodataAddr = 0; + Opts.GroupSegmentFixedSize = TestLdsSize; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); std::optional Lds = ViewOrErr->getKernelStaticLdsSize("test_kernel"); ASSERT_TRUE(Lds.has_value()); EXPECT_EQ(*Lds, TestLdsSize); } + +TEST(ElfView, KernelDescriptorsEnumeratesAndUpdatesEntryOffset) { + namespace hsa = llvm::amdhsa; + + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + std::vector KDs = ViewOrErr->kernelDescriptors(); + ASSERT_EQ(KDs.size(), 1u); + EXPECT_EQ(KDs[0].KernelName, "entry_kernel"); + EXPECT_EQ(KDs[0].VAddr, Obj.RodataAddr); + EXPECT_EQ(KDs[0].EntryOffset, Obj.EntryOffset); + EXPECT_EQ(ViewOrErr->getKernelDescriptorVAddr("entry_kernel"), + Obj.RodataAddr); + + const int64_t NewOff = -128; + ASSERT_TRUE( + ViewOrErr->updateKernelDescriptorEntryOffset("entry_kernel", NewOff)); + int64_t ReadBack = 0; + std::memcpy( + &ReadBack, + Obj.Bytes.data() + Obj.KernelDescriptorOffset + + offsetof(hsa::kernel_descriptor_t, kernel_code_entry_byte_offset), + sizeof(ReadBack)); + EXPECT_EQ(ReadBack, NewOff); + + ASSERT_TRUE(ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 10)); + EXPECT_GE(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 10u); +} + +TEST(ElfView, KernelDescriptorsSkipsKdWhenFileOffsetOverflows) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "overflow_kernel"; + Opts.RodataAddr = 0x1000; + Opts.KernelDescriptorSymbolValue = + std::numeric_limits::max() - 0x20; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + EXPECT_TRUE(ViewOrErr->kernelDescriptors().empty()); + EXPECT_EQ(ViewOrErr->findKernelDescriptor("overflow_kernel"), nullptr); +} + +TEST(ElfView, GrowWithTrampolinesShiftsAllocSectionSymbols) { + static constexpr uint64_t GrowthBytes = 8; + + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + Trampoline T; + T.Bytes.assign(GrowthBytes, 0); + std::vector Trampolines; + Trampolines.push_back(T); + const uint8_t SNop[4] = {}; + std::unique_ptr Out = + ViewOrErr->growWithTrampolines(Trampolines, SNop); + ASSERT_NE(Out, nullptr); + + uint8_t *OutData = reinterpret_cast(Out->getBufferStart()); + llvm::Expected OutView = + ElfView::create(OutData, Out->getBufferSize()); + ASSERT_TRUE((bool)OutView) << llvm::toString(OutView.takeError()); + std::vector KDs = OutView->kernelDescriptors(); + ASSERT_EQ(KDs.size(), 1u); + EXPECT_EQ(KDs[0].VAddr, Obj.RodataAddr + GrowthBytes); +} + +TEST(ElfView, UpdateKernelDescriptorSgprCountUpdatesMetadataAndDescriptor) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + Opts.MetadataSgprCount = 8; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + ASSERT_TRUE(ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 10)); + std::optional MetadataSgprs = + ViewOrErr->getKernelSgprCount("entry_kernel"); + ASSERT_TRUE(MetadataSgprs.has_value()); + EXPECT_EQ(*MetadataSgprs, 10u); + EXPECT_GE(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 10u); +} + +TEST(ElfView, UpdateKernelDescriptorSgprCountRejectsMissingMetadataCount) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + Opts.MetadataOmitSgprCount = true; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + EXPECT_FALSE(ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 10)); + EXPECT_EQ(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 8u); +} + +TEST(ElfView, UpdateKernelDescriptorSgprCountRejectsMissingMetadataKernel) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + Opts.MetadataKernelName = "other_kernel"; + Opts.MetadataSgprCount = 8; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + EXPECT_EQ(ViewOrErr->getKernelSgprCount("entry_kernel"), std::nullopt); + EXPECT_FALSE(ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 10)); + EXPECT_EQ(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 8u); +} + +TEST(ElfView, UpdateKernelDescriptorSgprCountRejectsNonIntegerMetadataCount) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + Opts.MetadataSgprCountAsString = true; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + EXPECT_EQ(ViewOrErr->getKernelSgprCount("entry_kernel"), std::nullopt); + EXPECT_FALSE(ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 10)); + EXPECT_EQ(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 8u); +} + +TEST(ElfView, UpdateKernelDescriptorSgprCountRejectsMetadataSizeChange) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + Opts.MetadataSgprCount = 9; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + EXPECT_FALSE(ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 128)); + std::optional MetadataSgprs = + ViewOrErr->getKernelSgprCount("entry_kernel"); + ASSERT_TRUE(MetadataSgprs.has_value()); + EXPECT_EQ(*MetadataSgprs, 9u); + EXPECT_EQ(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 8u); +} + +TEST(ElfView, UpdateKernelDescriptorSgprCountRejectsDescriptorLimitFirst) { + comgr_test::KernelDescriptorElfOptions Opts; + Opts.KernelName = "entry_kernel"; + Opts.MetadataSgprCount = 200; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(makeText(), Opts); + + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + EXPECT_FALSE( + ViewOrErr->updateKernelDescriptorSgprCount("entry_kernel", 100000)); + std::optional MetadataSgprs = + ViewOrErr->getKernelSgprCount("entry_kernel"); + ASSERT_TRUE(MetadataSgprs.has_value()); + EXPECT_EQ(*MetadataSgprs, 200u); + EXPECT_EQ(readReservedSgprs(Obj.Bytes, Obj.KernelDescriptorOffset), 8u); +} diff --git a/amd/comgr/test-unit/HotswapMCTest.cpp b/amd/comgr/test-unit/HotswapMCTest.cpp index bd82b6308fb9c..60029d3aad360 100644 --- a/amd/comgr/test-unit/HotswapMCTest.cpp +++ b/amd/comgr/test-unit/HotswapMCTest.cpp @@ -15,8 +15,11 @@ //===----------------------------------------------------------------------===// #include "comgr-hotswap-internal.h" +#include "comgr-test-elf-utils.h" #include "comgr.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/TargetSelect.h" #include "gtest/gtest.h" @@ -229,6 +232,48 @@ static llvm::MCInst assembleOne(llvm::StringRef Asm, const LLVMState &S) { return Decoded.empty() ? llvm::MCInst() : Decoded[0].Inst; } +static void expectSameOperands(const llvm::MCInst &Actual, + const llvm::MCInst &Expected, + llvm::StringRef Context) { + EXPECT_EQ(Actual.getOpcode(), Expected.getOpcode()) << Context.str(); + ASSERT_EQ(Actual.getNumOperands(), Expected.getNumOperands()) + << Context.str(); + for (unsigned I = 0, E = Actual.getNumOperands(); I != E; ++I) { + const llvm::MCOperand &ActualOp = Actual.getOperand(I); + const llvm::MCOperand &ExpectedOp = Expected.getOperand(I); + EXPECT_EQ(ActualOp.isReg(), ExpectedOp.isReg()) + << Context.str() << " operand " << I; + EXPECT_EQ(ActualOp.isImm(), ExpectedOp.isImm()) + << Context.str() << " operand " << I; + EXPECT_EQ(ActualOp.isSFPImm(), ExpectedOp.isSFPImm()) + << Context.str() << " operand " << I; + EXPECT_EQ(ActualOp.isDFPImm(), ExpectedOp.isDFPImm()) + << Context.str() << " operand " << I; + EXPECT_EQ(ActualOp.isExpr(), ExpectedOp.isExpr()) + << Context.str() << " operand " << I; + if (ExpectedOp.isReg()) { + EXPECT_EQ(ActualOp.getReg(), ExpectedOp.getReg()) + << Context.str() << " operand " << I; + } else if (ExpectedOp.isImm()) { + EXPECT_EQ(ActualOp.getImm(), ExpectedOp.getImm()) + << Context.str() << " operand " << I; + } else if (ExpectedOp.isSFPImm()) { + EXPECT_EQ(ActualOp.getSFPImm(), ExpectedOp.getSFPImm()) + << Context.str() << " operand " << I; + } else if (ExpectedOp.isDFPImm()) { + EXPECT_EQ(ActualOp.getDFPImm(), ExpectedOp.getDFPImm()) + << Context.str() << " operand " << I; + } + } +} + +static void expectInstMatchesAsm(const llvm::MCInst &Actual, + llvm::StringRef Asm, + const LLVMState &S) { + llvm::MCInst Expected = assembleOne(Asm, S); + expectSameOperands(Actual, Expected, Asm); +} + TEST(CheckVgprOverlap, DetectsDirectOverlap) { LLVMState S = initLLVM(makeGfx1250Ident()); ASSERT_TRUE(S.Valid); @@ -295,72 +340,298 @@ TEST(BuildTrampoline, EmptyOnBadAsm) { EXPECT_TRUE(T.Bytes.empty()); } -// -- classifyWmmaNops --------------------------------------------------------- +// -- buildKernelEntryTrampoline ----------------------------------------------- -TEST(ClassifyWmmaNops, NonWmmaReturnsDefault) { - WmmaNopReq Req = classifyWmmaNops("v_add_f32"); - EXPECT_EQ(Req.A0Nops, 4); - EXPECT_EQ(Req.B0Nops, 4); -} +TEST(BuildKernelEntryTrampoline, BuildsRecognizedPcRelativeStub) { + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); -TEST(ClassifyWmmaNops, IntegerWmmaReturns8) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_i32_16x16x32_iu8"); - EXPECT_EQ(Req.A0Nops, 8); - EXPECT_EQ(Req.B0Nops, 4); -} + constexpr uint64_t StubVAddr = 0x200000; + constexpr uint64_t EntryVAddr = 0x10100; + llvm::SmallVector GlobalWb = assembleSingleInst("global_wb", S); + ASSERT_EQ(GlobalWb.size(), 3 * MinInstSize); -TEST(ClassifyWmmaNops, Iu4Returns8) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_i32_16x16x64_iu4"); - EXPECT_EQ(Req.A0Nops, 8); - EXPECT_EQ(Req.B0Nops, 4); -} + llvm::SmallVector Bytes = + buildKernelEntryTrampoline(StubVAddr, EntryVAddr, /*ScratchSgpr=*/8, S); -TEST(ClassifyWmmaNops, F8f6f4Returns1) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_f32_16x16x128_f8f6f4"); - EXPECT_EQ(Req.A0Nops, 1); - EXPECT_EQ(Req.B0Nops, 4); -} + ASSERT_EQ(Bytes.size(), KernelEntryStubStride); + EXPECT_TRUE(isKernelEntryTrampoline(Bytes, S)); -TEST(ClassifyWmmaNops, Fp8_16x16x128Returns3) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_f32_16x16x128_fp8_fp8"); - EXPECT_EQ(Req.A0Nops, 3); - EXPECT_EQ(Req.B0Nops, 4); -} + std::vector Decoded; + ASSERT_TRUE(decodeTextSection(Bytes.data(), Bytes.size(), S, Decoded)); + ASSERT_GE(Decoded.size(), 6u); + EXPECT_EQ(Decoded[0].Inst.getOpcode(), S.GlobalWbOpcode); + EXPECT_EQ(Decoded[1].Inst.getOpcode(), S.VNopInst.getOpcode()); + EXPECT_EQ(Decoded[2].Inst.getOpcode(), S.SGetPcI64Opcode); + EXPECT_EQ(Decoded[3].Inst.getOpcode(), S.SAddU32Opcode); + EXPECT_EQ(Decoded[4].Inst.getOpcode(), S.SAddcU32Opcode); + EXPECT_EQ(Decoded[5].Inst.getOpcode(), S.SSetPcI64Opcode); + + const uint64_t PcBase = StubVAddr + Decoded[2].Offset + Decoded[2].Size; + const uint64_t Delta = EntryVAddr - PcBase; + const uint32_t Lo = static_cast(Delta); + const uint32_t Hi = static_cast(Delta >> 32); + expectInstMatchesAsm(Decoded[0].Inst, "global_wb", S); + expectInstMatchesAsm(Decoded[1].Inst, "v_nop", S); + expectInstMatchesAsm(Decoded[2].Inst, "s_get_pc_i64 s[8:9]", S); + expectInstMatchesAsm( + Decoded[3].Inst, + (llvm::Twine("s_add_u32 s8, s8, 0x") + llvm::utohexstr(Lo)).str(), S); + expectInstMatchesAsm( + Decoded[4].Inst, + (llvm::Twine("s_addc_u32 s9, s9, 0x") + llvm::utohexstr(Hi)).str(), S); + expectInstMatchesAsm(Decoded[5].Inst, "s_set_pc_i64 s[8:9]", S); +} + +TEST(BuildKernelEntryTrampoline, MatcherRejectsNonStubBytes) { + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); -TEST(ClassifyWmmaNops, Fp8SmallReturns1) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_f32_16x16x32_fp8_fp8"); - EXPECT_EQ(Req.A0Nops, 1); - EXPECT_EQ(Req.B0Nops, 4); -} + std::vector Bytes(KernelEntryStubStride, 0); + for (size_t I = 0; I < Bytes.size(); I += MinInstSize) + std::memcpy(Bytes.data() + I, S.SNopBytes.data(), MinInstSize); -TEST(ClassifyWmmaNops, F16Returns4) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_f32_16x16x16_f16"); - EXPECT_EQ(Req.A0Nops, 4); - EXPECT_EQ(Req.B0Nops, 4); + EXPECT_FALSE(isKernelEntryTrampoline(Bytes, S)); } -TEST(ClassifyWmmaNops, Bf16Returns4) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_f32_16x16x16_bf16"); - EXPECT_EQ(Req.A0Nops, 4); - EXPECT_EQ(Req.B0Nops, 4); +TEST(BuildKernelEntryTrampoline, MatcherRejectsWrongOperandShape) { + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); + + llvm::SmallVector Bytes; + auto Append = [&](llvm::StringRef Asm) { + llvm::SmallVector Inst = assembleSingleInst(Asm, S); + if (Inst.empty()) { + ADD_FAILURE() << "failed to assemble: " << Asm.str(); + return false; + } + Bytes.append(Inst.begin(), Inst.end()); + return true; + }; + + ASSERT_TRUE(Append("global_wb")); + ASSERT_TRUE(Append("v_nop")); + ASSERT_TRUE(Append("s_get_pc_i64 s[8:9]")); + ASSERT_TRUE(Append("s_add_u32 s8, s8, 0")); + ASSERT_TRUE(Append("s_addc_u32 s10, s10, 0")); + ASSERT_TRUE(Append("s_set_pc_i64 s[8:9]")); + + llvm::SmallVector CodeEnd = assembleSingleInst("s_code_end", S); + ASSERT_EQ(CodeEnd.size(), MinInstSize); + while (Bytes.size() < KernelEntryStubStride) + Bytes.append(CodeEnd.begin(), CodeEnd.end()); + ASSERT_EQ(Bytes.size(), KernelEntryStubStride); + + EXPECT_FALSE(isKernelEntryTrampoline(Bytes, S)); } -TEST(ClassifyWmmaNops, SwmmacIu8Returns8) { - WmmaNopReq Req = classifyWmmaNops("v_swmmac_i32_16x16x64_iu8"); - EXPECT_EQ(Req.A0Nops, 8); - EXPECT_EQ(Req.B0Nops, 4); +TEST(KernelEntryTrampoline, PreservesInstPrefSizeAndAddsPrefetchGuard) { + namespace hsa = llvm::amdhsa; + + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); + + llvm::SmallVector Text = assembleSingleInst("s_endpgm", S); + ASSERT_EQ(Text.size(), MinInstSize); + + uint32_t Rsrc3 = 0; + AMDHSA_BITS_SET(Rsrc3, hsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE, 7); + Rsrc3 |= hsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN; + comgr_test::KernelDescriptorElfOptions Opts; + Opts.ComputePgmRsrc3 = Rsrc3; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(Text, Opts); + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + uint8_t *Kd = ViewOrErr->findKernelDescriptor("kernel"); + ASSERT_NE(Kd, nullptr); + + std::vector Growth; + std::vector Fixups; + std::optional Count = appendKernelEntryTrampolines( + *ViewOrErr, S, /*MaxSgprs=*/106, Growth, Fixups); + ASSERT_TRUE(Count.has_value()); + EXPECT_EQ(*Count, 1u); + ASSERT_EQ(Fixups.size(), 1u); + + const uint64_t ExpectedGuard = computeKernelEntryPrefetchGuardBytes(7); + EXPECT_EQ(ExpectedGuard, + 7u * KernelEntryInstPrefUnitBytes - KernelEntryStubStride); + ASSERT_FALSE(Growth.empty()); + EXPECT_EQ(Growth.back().Bytes.size(), ExpectedGuard); + + const uint64_t OldTextSize = ViewOrErr->textSize(); + const uint64_t TextEndVAddr = ViewOrErr->textAddr() + OldTextSize; + const uint64_t ExpectedStubOffset = + ((TextEndVAddr + KernelEntryStubStride - 1) & + ~(KernelEntryStubStride - 1)) - + TextEndVAddr; + EXPECT_EQ(Fixups[0].StubTextOffset, ExpectedStubOffset); + + uint64_t GrowthTotal = 0; + for (const Trampoline &T : Growth) + GrowthTotal += T.Bytes.size(); + EXPECT_EQ(GrowthTotal, + ExpectedStubOffset + KernelEntryStubStride + ExpectedGuard); + + std::unique_ptr Out = + ViewOrErr->growWithTrampolines(Growth, S.SNopBytes); + ASSERT_NE(Out, nullptr); + + ASSERT_TRUE(rewriteKernelEntryDescriptorOffsets(*Out, OldTextSize, Fixups)); + + uint8_t *OutData = reinterpret_cast(Out->getBufferStart()); + llvm::Expected OutView = + ElfView::create(OutData, Out->getBufferSize()); + ASSERT_TRUE((bool)OutView) << llvm::toString(OutView.takeError()); + + uint8_t *OutKd = OutView->findKernelDescriptor("kernel"); + ASSERT_NE(OutKd, nullptr); + uint32_t OutRsrc3 = 0; + std::memcpy(&OutRsrc3, + OutKd + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc3), + sizeof(OutRsrc3)); + EXPECT_EQ(AMDHSA_BITS_GET(OutRsrc3, + hsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE), + 7u); + EXPECT_NE(OutRsrc3 & hsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN, 0u); + EXPECT_EQ(Fixups[0].RequiredSgprs, 10u); + uint32_t OutRsrc1 = 0; + std::memcpy(&OutRsrc1, + OutKd + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc1), + sizeof(OutRsrc1)); + unsigned ReservedSgprs = + (AMDHSA_BITS_GET(OutRsrc1, + hsa::COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) + + 1) * + 8; + EXPECT_GE(ReservedSgprs, Fixups[0].RequiredSgprs); + + std::vector KDs = OutView->kernelDescriptors(); + ASSERT_EQ(KDs.size(), 1u); + std::optional KdVAddr = OutView->getKernelDescriptorVAddr("kernel"); + ASSERT_TRUE(KdVAddr.has_value()); + const uint64_t StubVAddr = + ViewOrErr->textAddr() + OldTextSize + Fixups[0].StubTextOffset; + EXPECT_EQ(KDs[0].EntryOffset, static_cast(StubVAddr - *KdVAddr)); +} + +TEST(KernelEntryTrampoline, AlignsStubByVirtualAddress) { + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); + + llvm::SmallVector Text = assembleSingleInst("s_endpgm", S); + ASSERT_EQ(Text.size(), MinInstSize); + + comgr_test::KernelDescriptorElfOptions Opts; + Opts.TextAddr = 0x1080; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(Text, Opts); + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + std::vector Growth; + std::vector Fixups; + std::optional Count = appendKernelEntryTrampolines( + *ViewOrErr, S, /*MaxSgprs=*/106, Growth, Fixups); + + ASSERT_TRUE(Count.has_value()); + EXPECT_EQ(*Count, 1u); + ASSERT_EQ(Fixups.size(), 1u); + const uint64_t StubVAddr = + ViewOrErr->textAddr() + ViewOrErr->textSize() + Fixups[0].StubTextOffset; + EXPECT_EQ(StubVAddr % KernelEntryStubStride, 0u); + EXPECT_NE((ViewOrErr->textSize() + Fixups[0].StubTextOffset) % + KernelEntryStubStride, + 0u); +} + +TEST(KernelEntryTrampoline, AppendReturnsZeroWhenNoDescriptorsExist) { + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); + + llvm::SmallVector Text = assembleSingleInst("s_endpgm", S); + ASSERT_EQ(Text.size(), MinInstSize); + + comgr_test::KernelDescriptorElfOptions Opts; + Opts.EmitKernelDescriptorSymbol = false; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(Text, Opts); + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + std::vector Growth; + std::vector Fixups; + std::optional Count = appendKernelEntryTrampolines( + *ViewOrErr, S, /*MaxSgprs=*/106, Growth, Fixups); + + ASSERT_TRUE(Count.has_value()); + EXPECT_EQ(*Count, 0u); + EXPECT_TRUE(Growth.empty()); + EXPECT_TRUE(Fixups.empty()); } -TEST(ClassifyWmmaNops, F32WmmaFallsToDefault) { - WmmaNopReq Req = classifyWmmaNops("v_wmma_f32_16x16x4_f32"); - EXPECT_EQ(Req.A0Nops, 4); - EXPECT_EQ(Req.B0Nops, 4); +TEST(KernelEntryTrampoline, AppendFailsWithoutSgprScratchPair) { + LLVMState S = initLLVM(makeGfx1250Ident()); + ASSERT_TRUE(S.Valid); + + llvm::SmallVector Text = assembleSingleInst("s_endpgm", S); + ASSERT_EQ(Text.size(), MinInstSize); + + comgr_test::KernelDescriptorElfOptions Opts; + Opts.MetadataSgprCount = 105; + comgr_test::KernelDescriptorElf Obj = + comgr_test::makeKernelDescriptorElf(Text, Opts); + llvm::Expected ViewOrErr = + ElfView::create(Obj.Bytes.data(), Obj.Bytes.size()); + ASSERT_TRUE((bool)ViewOrErr) << llvm::toString(ViewOrErr.takeError()); + + Trampoline Existing; + Existing.Bytes.assign(S.SNopBytes.begin(), S.SNopBytes.end()); + std::vector Growth; + Growth.push_back(Existing); + std::vector Fixups; + std::optional Count = appendKernelEntryTrampolines( + *ViewOrErr, S, /*MaxSgprs=*/106, Growth, Fixups); + + EXPECT_FALSE(Count.has_value()); + ASSERT_EQ(Growth.size(), 1u); + EXPECT_EQ(llvm::ArrayRef(Growth[0].Bytes), + llvm::ArrayRef(Existing.Bytes)); + EXPECT_TRUE(Fixups.empty()); } -TEST(ClassifyWmmaNops, OrderingMostRestrictiveWins) { - // A mnemonic containing both _iu8 and _f16 should return 8 (iu8 first) - WmmaNopReq Req = classifyWmmaNops("v_wmma_f16_something_iu8"); - EXPECT_EQ(Req.A0Nops, 8); +// -- classifyWmmaNops --------------------------------------------------------- + +TEST(ClassifyWmmaNops, CoversKnownMnemonics) { + struct Case { + llvm::StringLiteral Mnemonic; + int A0Nops; + int B0Nops; + }; + const Case Cases[] = { + {"v_add_f32", 4, 4}, + {"v_wmma_i32_16x16x32_iu8", 8, 4}, + {"v_wmma_i32_16x16x64_iu4", 8, 4}, + {"v_wmma_f32_16x16x128_f8f6f4", 1, 4}, + {"v_wmma_f32_16x16x128_fp8_fp8", 3, 4}, + {"v_wmma_f32_16x16x32_fp8_fp8", 1, 4}, + {"v_wmma_f32_16x16x16_f16", 4, 4}, + {"v_wmma_f32_16x16x16_bf16", 4, 4}, + {"v_swmmac_i32_16x16x64_iu8", 8, 4}, + {"v_wmma_f32_16x16x4_f32", 4, 4}, + {"v_wmma_f16_something_iu8", 8, 4}, + }; + + for (const Case &C : Cases) { + WmmaNopReq Req = classifyWmmaNops(C.Mnemonic); + EXPECT_EQ(Req.A0Nops, C.A0Nops) << C.Mnemonic.str(); + EXPECT_EQ(Req.B0Nops, C.B0Nops) << C.Mnemonic.str(); + } } // -- patchScaleSrc2 ----------------------------------------------------------- @@ -505,8 +776,8 @@ TEST(HotswapPatchVTable, ProcessSingletonIdentityAndEagerInstall) { // Tests for the ds_load_addtid_b32 / ds_store_addtid_b32 trampoline patch // (DEGFXMI400-12025). Coverage is bottom-up: first that the encode/decode // of ADDTID instructions exposes the expected MCInst operand layout, then -// that the trampoline replacement asm round-trips through the MC layer, -// then that buildTrampoline integrates a full ADDTID body. +// that buildTrampoline assembles and decodes a full ADDTID replacement body +// plus its branch-back tail. namespace { @@ -527,107 +798,53 @@ llvm::MCInst decodeOne(llvm::StringRef Asm, const LLVMState &S) { return Decoded.empty() ? llvm::MCInst() : Decoded[0].Inst; } -} // namespace - -TEST(AddTid, LoadAddTidDecodesWithExpectedLayout) { - LLVMState S = initLLVM(makeGfx1250Ident()); - ASSERT_TRUE(S.Valid); - - llvm::MCInst Inst = decodeOne("ds_load_addtid_b32 v5 offset:128", S); +void expectAddTidLayout(llvm::StringRef Asm, int64_t Offset, + llvm::StringRef RegName, const LLVMState &S) { + llvm::MCInst Inst = decodeOne(Asm, S); ASSERT_GE(Inst.getNumOperands(), 3u); - // Direct operand access: register, then offset, then gds bit. No - // print-and-parse round-trip -- production code uses the same operand - // indices to reach the destination VGPR. EXPECT_TRUE(Inst.getOperand(AddtidOpReg).isReg()); EXPECT_NE(Inst.getOperand(AddtidOpReg).getReg(), 0u); EXPECT_TRUE(Inst.getOperand(AddtidOpOffset).isImm()); - EXPECT_EQ(Inst.getOperand(AddtidOpOffset).getImm(), 128); + EXPECT_EQ(Inst.getOperand(AddtidOpOffset).getImm(), Offset); EXPECT_TRUE(Inst.getOperand(AddtidOpGds).isImm()); EXPECT_EQ(Inst.getOperand(AddtidOpGds).getImm(), 0); - // Production code uses MRI.getName() to resolve the VGPR identifier - // ("VGPR5" for v5); pin that so a tablegen rename in upstream catches - // here rather than silently breaking the trampoline. const char *N = S.MRI->getName(Inst.getOperand(AddtidOpReg).getReg()); ASSERT_NE(N, nullptr); - EXPECT_STREQ(N, "VGPR5"); + EXPECT_EQ(llvm::StringRef(N).str(), RegName.str()); } -TEST(AddTid, StoreAddTidDecodesWithExpectedLayout) { - LLVMState S = initLLVM(makeGfx1250Ident()); - ASSERT_TRUE(S.Valid); - - llvm::MCInst Inst = decodeOne("ds_store_addtid_b32 v10 offset:256", S); - ASSERT_GE(Inst.getNumOperands(), 3u); - EXPECT_TRUE(Inst.getOperand(AddtidOpReg).isReg()); - EXPECT_NE(Inst.getOperand(AddtidOpReg).getReg(), 0u); - EXPECT_TRUE(Inst.getOperand(AddtidOpOffset).isImm()); - EXPECT_EQ(Inst.getOperand(AddtidOpOffset).getImm(), 256); - EXPECT_TRUE(Inst.getOperand(AddtidOpGds).isImm()); - EXPECT_EQ(Inst.getOperand(AddtidOpGds).getImm(), 0); - - const char *N = S.MRI->getName(Inst.getOperand(AddtidOpReg).getReg()); - ASSERT_NE(N, nullptr); - EXPECT_STREQ(N, "VGPR10"); +void expectDecodedMnemonics(llvm::ArrayRef Decoded, + llvm::ArrayRef Expected) { + ASSERT_EQ(Decoded.size(), Expected.size()); + for (size_t I = 0; I < Expected.size(); ++I) + EXPECT_EQ(Decoded[I].Mnemonic, Expected[I].str()) << "index " << I; } -TEST(AddTid, LoadTrampolineAsmAssemblesAndDecodes) { - LLVMState S = initLLVM(makeGfx1250Ident()); - ASSERT_TRUE(S.Valid); - - // Replacement asm for ds_load_addtid_b32 v7 offset:64. - // The v_and_b32 with 0xfffff masks M0 to the 20 bits that B0's DS unit - // would have read, keeping the rewrite bit-exact with B0 hardware - // regardless of stale bits in M0[31:20] on entry. - std::string Asm = "v_mbcnt_lo_u32_b32 v7, -1, 0\n" - "v_mbcnt_hi_u32_b32 v7, -1, v7\n" - "v_lshlrev_b32 v7, 2, v7\n" - "v_add_nc_u32 v7, m0, v7\n" - "v_and_b32 v7, 0xfffff, v7\n" - "ds_load_b32 v7, v7 offset:64\n"; - - llvm::SmallVector Bytes = assembleSingleInst(Asm, S); - ASSERT_FALSE(Bytes.empty()); - - std::vector Decoded; - ASSERT_TRUE(decodeTextSection(Bytes.data(), Bytes.size(), S, Decoded)); - ASSERT_EQ(Decoded.size(), 6u); - EXPECT_EQ(Decoded[0].Mnemonic, "v_mbcnt_lo_u32_b32"); - EXPECT_EQ(Decoded[1].Mnemonic, "v_mbcnt_hi_u32_b32"); - EXPECT_EQ(Decoded[2].Mnemonic, "v_lshlrev_b32"); - EXPECT_EQ(Decoded[3].Mnemonic, "v_add_nc_u32"); - EXPECT_EQ(Decoded[4].Mnemonic, "v_and_b32"); - EXPECT_EQ(Decoded[5].Mnemonic, "ds_load_b32"); +void expectDecodedBodyMatchesAsm(llvm::ArrayRef Decoded, + llvm::ArrayRef AsmLines, + const LLVMState &S) { + ASSERT_GE(Decoded.size(), AsmLines.size()); + for (size_t I = 0; I < AsmLines.size(); ++I) { + llvm::MCInst Expected = decodeOne(AsmLines[I], S); + expectSameOperands(Decoded[I].Inst, Expected, AsmLines[I]); + } } -TEST(AddTid, StoreTrampolineAsmAssemblesAndDecodes) { +} // namespace + +TEST(AddTid, AddTidDecodesWithExpectedLayout) { LLVMState S = initLLVM(makeGfx1250Ident()); ASSERT_TRUE(S.Valid); - // Replacement asm for ds_store_addtid_b32 v10 offset:0 with v42 as the - // address-compute scratch (the data VGPR v10 is not clobbered). The - // v_and_b32 with 0xfffff masks M0 to the 20-bit DS-unit width; see - // LoadTrampolineAsmAssemblesAndDecodes for the rationale. - std::string Asm = "v_mbcnt_lo_u32_b32 v42, -1, 0\n" - "v_mbcnt_hi_u32_b32 v42, -1, v42\n" - "v_lshlrev_b32 v42, 2, v42\n" - "v_add_nc_u32 v42, m0, v42\n" - "v_and_b32 v42, 0xfffff, v42\n" - "ds_store_b32 v42, v10\n"; - - llvm::SmallVector Bytes = assembleSingleInst(Asm, S); - ASSERT_FALSE(Bytes.empty()); - - std::vector Decoded; - ASSERT_TRUE(decodeTextSection(Bytes.data(), Bytes.size(), S, Decoded)); - ASSERT_EQ(Decoded.size(), 6u); - EXPECT_EQ(Decoded[0].Mnemonic, "v_mbcnt_lo_u32_b32"); - EXPECT_EQ(Decoded[1].Mnemonic, "v_mbcnt_hi_u32_b32"); - EXPECT_EQ(Decoded[2].Mnemonic, "v_lshlrev_b32"); - EXPECT_EQ(Decoded[3].Mnemonic, "v_add_nc_u32"); - EXPECT_EQ(Decoded[4].Mnemonic, "v_and_b32"); - EXPECT_EQ(Decoded[5].Mnemonic, "ds_store_b32"); + // Direct operand access: register, then offset, then gds bit. No + // print-and-parse round-trip -- production code uses the same operand + // indices to reach the destination VGPR. + // Production code uses MRI.getName() to resolve the VGPR identifier + // ("VGPR5" for v5, etc.); pin that so a tablegen rename catches here. + expectAddTidLayout("ds_load_addtid_b32 v5 offset:128", 128, "VGPR5", S); + expectAddTidLayout("ds_store_addtid_b32 v10 offset:256", 256, "VGPR10", S); } TEST(AddTid, LoadTrampolineThroughBuildTrampoline) { @@ -651,8 +868,15 @@ TEST(AddTid, LoadTrampolineThroughBuildTrampoline) { // 6 body instructions + 1 branch-back tail. std::vector Decoded; ASSERT_TRUE(decodeTextSection(T.Bytes.data(), T.Bytes.size(), S, Decoded)); - ASSERT_EQ(Decoded.size(), 7u); - EXPECT_EQ(Decoded[6].Mnemonic, "s_branch"); + const llvm::StringRef Expected[] = {"v_mbcnt_lo_u32_b32", + "v_mbcnt_hi_u32_b32", + "v_lshlrev_b32", + "v_add_nc_u32", + "v_and_b32", + "ds_load_b32", + "s_branch"}; + expectDecodedMnemonics(Decoded, Expected); + expectDecodedBodyMatchesAsm(Decoded, AsmLines, S); } TEST(AddTid, StoreTrampolineThroughBuildTrampoline) { @@ -681,8 +905,13 @@ TEST(AddTid, StoreTrampolineThroughBuildTrampoline) { // 6 body instructions + 1 branch-back tail, matching the load variant. std::vector Decoded; ASSERT_TRUE(decodeTextSection(T.Bytes.data(), T.Bytes.size(), S, Decoded)); - ASSERT_EQ(Decoded.size(), 7u); - EXPECT_EQ(Decoded[0].Mnemonic, "v_mbcnt_lo_u32_b32"); - EXPECT_EQ(Decoded[5].Mnemonic, "ds_store_b32"); - EXPECT_EQ(Decoded[6].Mnemonic, "s_branch"); + const llvm::StringRef Expected[] = {"v_mbcnt_lo_u32_b32", + "v_mbcnt_hi_u32_b32", + "v_lshlrev_b32", + "v_add_nc_u32", + "v_and_b32", + "ds_store_b32", + "s_branch"}; + expectDecodedMnemonics(Decoded, Expected); + expectDecodedBodyMatchesAsm(Decoded, AsmLines, S); } diff --git a/amd/comgr/test-unit/comgr-test-elf-utils.h b/amd/comgr/test-unit/comgr-test-elf-utils.h index 25d8726ee377b..6bfc01f347538 100644 --- a/amd/comgr/test-unit/comgr-test-elf-utils.h +++ b/amd/comgr/test-unit/comgr-test-elf-utils.h @@ -9,9 +9,20 @@ #ifndef COMGR_TEST_UNIT_ELF_UTILS_H #define COMGR_TEST_UNIT_ELF_UTILS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MsgPackDocument.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/MathExtras.h" +#include +#include #include +#include +#include +#include +#include +#include namespace comgr_test { @@ -33,6 +44,261 @@ inline llvm::ELF::Elf64_Ehdr makeElf64Ehdr(uint16_t Machine, return Ehdr; } +inline uint64_t alignTo4(uint64_t V) { return llvm::alignTo(V, 4); } + +inline uint64_t alignTo8(uint64_t V) { return llvm::alignTo(V, 8); } + +struct KernelDescriptorElfOptions { + uint16_t ElfType = llvm::ELF::ET_DYN; + std::string KernelName = "kernel"; + uint64_t TextAddr = 0x1000; + uint64_t RodataAddr = 0x2000; + bool EmitKernelDescriptorSymbol = true; + std::optional KernelDescriptorSymbolValue; + uint32_t GroupSegmentFixedSize = 0; + uint32_t ComputePgmRsrc3 = 0; + std::optional MetadataKernelName; + std::optional MetadataSgprCount; + bool MetadataOmitSgprCount = false; + bool MetadataSgprCountAsString = false; +}; + +struct KernelDescriptorElf { + std::vector Bytes; + uint64_t RodataAddr = 0; + uint64_t KernelDescriptorOffset = 0; + int64_t EntryOffset = 0; +}; + +inline std::string +makeAmdgpuMetadataBlob(const KernelDescriptorElfOptions &Options) { + llvm::msgpack::Document Doc; + llvm::msgpack::MapDocNode Root = Doc.getRoot().getMap(/*Convert=*/true); + llvm::msgpack::ArrayDocNode Kernels = Doc.getArrayNode(); + llvm::msgpack::MapDocNode Kernel = Doc.getMapNode(); + + const std::string &MetadataKernelName = Options.MetadataKernelName + ? *Options.MetadataKernelName + : Options.KernelName; + Kernel[".name"] = Doc.getNode(MetadataKernelName, /*Copy=*/true); + if (!Options.MetadataOmitSgprCount) { + if (Options.MetadataSgprCountAsString) + Kernel[".sgpr_count"] = Doc.getNode("not-an-integer", /*Copy=*/true); + else + Kernel[".sgpr_count"] = + static_cast(Options.MetadataSgprCount.value_or(0)); + } + Kernels.push_back(Kernel); + Root["amdhsa.kernels"] = Kernels; + + std::string Blob; + Doc.writeToBlob(Blob); + return Blob; +} + +inline void appendBytes(std::vector &Out, const void *Data, + size_t Size) { + const uint8_t *Begin = reinterpret_cast(Data); + Out.insert(Out.end(), Begin, Begin + Size); +} + +inline void appendPadding(std::vector &Out, uint64_t Alignment) { + assert(llvm::isPowerOf2_64(Alignment)); + uint64_t PaddedSize = + llvm::alignTo(static_cast(Out.size()), Alignment); + assert(PaddedSize <= std::numeric_limits::max()); + Out.resize(static_cast(PaddedSize), 0); +} + +inline std::vector makeAmdgpuMetadataNote(llvm::StringRef Blob) { + using namespace llvm::ELF; + + static constexpr char NoteName[] = "AMDGPU"; + + assert(Blob.size() <= std::numeric_limits::max()); + + Elf64_Nhdr Header{}; + Header.n_namesz = sizeof(NoteName); + Header.n_descsz = static_cast(Blob.size()); + Header.n_type = NT_AMDGPU_METADATA; + + std::vector Note; + appendBytes(Note, &Header, sizeof(Header)); + appendBytes(Note, NoteName, sizeof(NoteName)); + appendPadding(Note, 4); + appendBytes(Note, Blob.data(), Blob.size()); + appendPadding(Note, 4); + return Note; +} + +inline KernelDescriptorElf +makeKernelDescriptorElf(llvm::ArrayRef Text, + const KernelDescriptorElfOptions &Options = {}) { + using namespace llvm::ELF; + namespace hsa = llvm::amdhsa; + + static constexpr uint64_t ShOff = sizeof(Elf64_Ehdr); + static constexpr uint64_t TextOffset = 0x240; + static constexpr uint64_t KdBytes = sizeof(hsa::kernel_descriptor_t); + static constexpr char ShStrTab[] = + "\0.text\0.rodata\0.strtab\0.symtab\0.shstrtab\0"; + + std::string StrTab; + StrTab.push_back('\0'); + uint32_t KernelNameOff = StrTab.size(); + StrTab += Options.KernelName; + StrTab.push_back('\0'); + uint32_t KdNameOff = StrTab.size(); + StrTab += Options.KernelName; + StrTab += ".kd"; + StrTab.push_back('\0'); + + const bool HasMetadataNote = Options.MetadataSgprCount || + Options.MetadataOmitSgprCount || + Options.MetadataSgprCountAsString; + std::vector MetadataNote; + if (HasMetadataNote) { + std::string MetadataBlob = makeAmdgpuMetadataBlob(Options); + MetadataNote = makeAmdgpuMetadataNote(MetadataBlob); + } + + const uint64_t RodataOff = alignTo8(TextOffset + Text.size()); + const uint64_t StrTabOff = alignTo8(RodataOff + KdBytes); + const uint64_t SymTabOff = alignTo8(StrTabOff + StrTab.size()); + const uint64_t SymCount = Options.EmitKernelDescriptorSymbol ? 3 : 2; + const uint64_t ShStrTabOff = + alignTo8(SymTabOff + SymCount * sizeof(Elf64_Sym)); + const uint64_t NoteOff = + HasMetadataNote ? alignTo4(ShStrTabOff + sizeof(ShStrTab)) : 0; + const uint64_t PhOff = + HasMetadataNote ? alignTo8(NoteOff + MetadataNote.size()) : 0; + const uint64_t ContentEnd = HasMetadataNote ? PhOff + sizeof(Elf64_Phdr) + : ShStrTabOff + sizeof(ShStrTab); + const uint64_t BufSize = alignTo8(ContentEnd + 64); + + KernelDescriptorElf Result; + Result.Bytes.assign(BufSize, 0); + Result.RodataAddr = Options.RodataAddr; + Result.KernelDescriptorOffset = RodataOff; + const uint64_t KernelDescriptorAddr = + Options.KernelDescriptorSymbolValue.value_or( + Options.ElfType == ET_REL ? 0 : Options.RodataAddr); + Result.EntryOffset = static_cast(Options.TextAddr) - + static_cast(Options.RodataAddr); + + uint8_t *Buf = Result.Bytes.data(); + std::memcpy(Buf + TextOffset, Text.data(), Text.size()); + std::memcpy(Buf + StrTabOff, StrTab.data(), StrTab.size()); + std::memcpy(Buf + ShStrTabOff, ShStrTab, sizeof(ShStrTab)); + if (HasMetadataNote) + std::memcpy(Buf + NoteOff, MetadataNote.data(), MetadataNote.size()); + + Elf64_Ehdr Ehdr = makeElf64Ehdr(EM_AMDGPU); + Ehdr.e_ident[EI_OSABI] = ELFOSABI_AMDGPU_HSA; + Ehdr.e_type = Options.ElfType; + Ehdr.e_version = EV_CURRENT; + Ehdr.e_shoff = ShOff; + if (HasMetadataNote) { + Ehdr.e_phoff = PhOff; + Ehdr.e_phentsize = sizeof(Elf64_Phdr); + Ehdr.e_phnum = 1; + } + Ehdr.e_ehsize = sizeof(Elf64_Ehdr); + Ehdr.e_shentsize = sizeof(Elf64_Shdr); + Ehdr.e_shnum = 6; + Ehdr.e_shstrndx = 5; + std::memcpy(Buf, &Ehdr, sizeof(Ehdr)); + + Elf64_Shdr TextSh{}; + TextSh.sh_name = 1; + TextSh.sh_type = SHT_PROGBITS; + TextSh.sh_flags = SHF_ALLOC | SHF_EXECINSTR; + TextSh.sh_offset = TextOffset; + TextSh.sh_addr = Options.TextAddr; + TextSh.sh_size = Text.size(); + TextSh.sh_addralign = 4; + std::memcpy(Buf + ShOff + 1 * sizeof(Elf64_Shdr), &TextSh, sizeof(TextSh)); + + Elf64_Shdr RodataSh{}; + RodataSh.sh_name = 7; + RodataSh.sh_type = SHT_PROGBITS; + RodataSh.sh_flags = SHF_ALLOC; + RodataSh.sh_offset = RodataOff; + RodataSh.sh_addr = Options.RodataAddr; + RodataSh.sh_size = KdBytes; + RodataSh.sh_addralign = 8; + std::memcpy(Buf + ShOff + 2 * sizeof(Elf64_Shdr), &RodataSh, + sizeof(RodataSh)); + + Elf64_Shdr StrtabSh{}; + StrtabSh.sh_name = 15; + StrtabSh.sh_type = SHT_STRTAB; + StrtabSh.sh_offset = StrTabOff; + StrtabSh.sh_size = StrTab.size(); + std::memcpy(Buf + ShOff + 3 * sizeof(Elf64_Shdr), &StrtabSh, + sizeof(StrtabSh)); + + Elf64_Shdr SymtabSh{}; + SymtabSh.sh_name = 23; + SymtabSh.sh_type = SHT_SYMTAB; + SymtabSh.sh_offset = SymTabOff; + SymtabSh.sh_size = SymCount * sizeof(Elf64_Sym); + SymtabSh.sh_link = 3; + SymtabSh.sh_entsize = sizeof(Elf64_Sym); + std::memcpy(Buf + ShOff + 4 * sizeof(Elf64_Shdr), &SymtabSh, + sizeof(SymtabSh)); + + Elf64_Shdr ShstrSh{}; + ShstrSh.sh_name = 31; + ShstrSh.sh_type = SHT_STRTAB; + ShstrSh.sh_offset = ShStrTabOff; + ShstrSh.sh_size = sizeof(ShStrTab); + std::memcpy(Buf + ShOff + 5 * sizeof(Elf64_Shdr), &ShstrSh, sizeof(ShstrSh)); + + if (HasMetadataNote) { + Elf64_Phdr NotePhdr{}; + NotePhdr.p_type = PT_NOTE; + NotePhdr.p_offset = NoteOff; + NotePhdr.p_filesz = MetadataNote.size(); + NotePhdr.p_memsz = MetadataNote.size(); + NotePhdr.p_align = 4; + std::memcpy(Buf + PhOff, &NotePhdr, sizeof(NotePhdr)); + } + + std::memcpy( + Buf + RodataOff + + offsetof(hsa::kernel_descriptor_t, kernel_code_entry_byte_offset), + &Result.EntryOffset, sizeof(Result.EntryOffset)); + std::memcpy(Buf + RodataOff + + offsetof(hsa::kernel_descriptor_t, group_segment_fixed_size), + &Options.GroupSegmentFixedSize, + sizeof(Options.GroupSegmentFixedSize)); + std::memcpy(Buf + RodataOff + + offsetof(hsa::kernel_descriptor_t, compute_pgm_rsrc3), + &Options.ComputePgmRsrc3, sizeof(Options.ComputePgmRsrc3)); + + Elf64_Sym KernelSym{}; + KernelSym.st_name = KernelNameOff; + KernelSym.setBindingAndType(STB_GLOBAL, STT_FUNC); + KernelSym.st_shndx = 1; + KernelSym.st_value = Options.TextAddr; + KernelSym.st_size = Text.size(); + std::memcpy(Buf + SymTabOff + 1 * sizeof(Elf64_Sym), &KernelSym, + sizeof(KernelSym)); + + if (Options.EmitKernelDescriptorSymbol) { + Elf64_Sym KdSym{}; + KdSym.st_name = KdNameOff; + KdSym.setBindingAndType(STB_GLOBAL, STT_OBJECT); + KdSym.st_shndx = 2; + KdSym.st_value = KernelDescriptorAddr; + KdSym.st_size = KdBytes; + std::memcpy(Buf + SymTabOff + 2 * sizeof(Elf64_Sym), &KdSym, sizeof(KdSym)); + } + + return Result; +} + } // namespace comgr_test #endif // COMGR_TEST_UNIT_ELF_UTILS_H From 04ba3f5ca921e6f98a45ad4a4874392fe8ea7e02 Mon Sep 17 00:00:00 2001 From: Harsh Menon Date: Tue, 30 Jun 2026 07:05:58 -0700 Subject: [PATCH 2/4] AMDGPU: expose hotswap rewrite options API --- amd/comgr/include/amd_comgr.h.in | 82 +++++++++++++++++++++ amd/comgr/src/comgr-env.cpp | 5 -- amd/comgr/src/comgr-env.h | 3 - amd/comgr/src/comgr-hotswap.cpp | 123 ++++++++++++++++++++++--------- amd/comgr/src/exportmap.in | 1 + amd/comgr/src/hotswap/README.md | 14 ++-- 6 files changed, 177 insertions(+), 51 deletions(-) diff --git a/amd/comgr/include/amd_comgr.h.in b/amd/comgr/include/amd_comgr.h.in index 1e393f1e63192..63cfd4c248abe 100644 --- a/amd/comgr/include/amd_comgr.h.in +++ b/amd/comgr/include/amd_comgr.h.in @@ -2833,6 +2833,88 @@ amd_comgr_hotswap_rewrite( const char *target_isa_name, amd_comgr_data_t *output) AMD_COMGR_VERSION_3_2; +/** + * @brief HotSwap rewrite option flags. + */ +typedef enum amd_comgr_hotswap_rewrite_flag_s { + /** + * Apply the default rewrite behavior. + */ + AMD_COMGR_HOTSWAP_REWRITE_FLAG_NONE = 0, + /** + * Redirect kernel descriptors through generated entry stubs. + */ + AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES = 0x1, +} amd_comgr_hotswap_rewrite_flag_t; + +/** + * @brief Options for @p amd_comgr_hotswap_rewrite_with_options. + */ +typedef struct amd_comgr_hotswap_rewrite_options_s { + /** + * Size of this structure, in bytes. Must be at least + * sizeof(amd_comgr_hotswap_rewrite_options_t). + */ + size_t size; + /** + * Bitwise OR of @p amd_comgr_hotswap_rewrite_flag_t values. + */ + uint64_t flags; +} amd_comgr_hotswap_rewrite_options_t; + +/** + * @brief Rewrite a code object from one ISA to another with explicit options. + * + * Rewrites GPU instructions in the ELF code object so that it can execute + * on a different target ISA. This includes both same-family stepping + * patches (e.g. B0 to A0) and cross-family transpilation. + * The input ELF is not modified; a new data object is created and returned. + * + * If no patches are needed, the output is a copy of the input. + * + * Currently supported transformations: + * - GFX1250 B0 to A0 + * - GFX125x entry trampolines when requested by @p rewrite_options + * + * Additional source/target ISA pairs may be added in future releases. + * Unsupported @p source_isa_name / @p target_isa_name combinations return + * @c AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT. + * + * @param[in] input A data object of kind @p AMD_COMGR_DATA_KIND_EXECUTABLE + * containing the input ELF code object bytes. + * @param[in] source_isa_name A null terminated string that is the isa name + * the code object was compiled for. The isa name is defined as the Code + * Object Target Identification string, described at + * https://llvm.org/docs/AMDGPUUsage.html#code-object-target-identification + * @param[in] target_isa_name A null terminated string that is the isa name + * of the target GPU. + * @param[in] rewrite_options Options controlling optional rewrite behavior. + * Must not be NULL. Unknown flag bits return + * @c AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT. + * @param[out] output A handle to a data object of kind @p + * AMD_COMGR_DATA_KIND_EXECUTABLE containing the rewritten ELF. The caller + * must release this handle using @c amd_comgr_release_data when done. + * @p output is not modified on failure. + * + * @retval ::AMD_COMGR_STATUS_SUCCESS Patching completed successfully. + * @retval ::AMD_COMGR_STATUS_ERROR An internal error occurred. + * @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p input is an invalid + * data object, is not of kind @p AMD_COMGR_DATA_KIND_EXECUTABLE, does not + * contain data bytes, @p source_isa_name, @p target_isa_name, @p + * rewrite_options, or @p output is NULL, the source/target isa name + * combination is not supported, the options structure is too small, or + * unsupported option flags are set. + * @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to allocate + * the output data object. + */ +amd_comgr_status_t AMD_COMGR_API +amd_comgr_hotswap_rewrite_with_options( + amd_comgr_data_t input, + const char *source_isa_name, + const char *target_isa_name, + const amd_comgr_hotswap_rewrite_options_t *rewrite_options, + amd_comgr_data_t *output) AMD_COMGR_VERSION_3_3; + /** @} */ #ifdef __cplusplus diff --git a/amd/comgr/src/comgr-env.cpp b/amd/comgr/src/comgr-env.cpp index 52ffc17a625ec..7922cf67895c1 100644 --- a/amd/comgr/src/comgr-env.cpp +++ b/amd/comgr/src/comgr-env.cpp @@ -84,11 +84,6 @@ bool shouldEmitVerboseLogs() { return VerboseLogs && StringRef(VerboseLogs) != "0"; } -bool shouldUseHotswapEntryTrampolines() { - static char *EntryTrampolines = getenv("AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES"); - return EntryTrampolines && StringRef(EntryTrampolines) != "0"; -} - llvm::StringRef getLLVMPath() { static const char *EnvLLVMPath = std::getenv("LLVM_PATH"); return EnvLLVMPath; diff --git a/amd/comgr/src/comgr-env.h b/amd/comgr/src/comgr-env.h index 75e9d30ab99ca..9715b48a32c46 100644 --- a/amd/comgr/src/comgr-env.h +++ b/amd/comgr/src/comgr-env.h @@ -26,9 +26,6 @@ std::optional getRedirectLogs(); /// Return whether the environment requests verbose logging. bool shouldEmitVerboseLogs(); -/// Return whether hotswap should redirect kernel descriptors to entry stubs. -bool shouldUseHotswapEntryTrampolines(); - /// Return whether the environment requests time statistics collection. bool needTimeStatistics(); diff --git a/amd/comgr/src/comgr-hotswap.cpp b/amd/comgr/src/comgr-hotswap.cpp index 2017f8b92f2d4..dde6f28da1ded 100644 --- a/amd/comgr/src/comgr-hotswap.cpp +++ b/amd/comgr/src/comgr-hotswap.cpp @@ -6,7 +6,6 @@ //===----------------------------------------------------------------------===// #include "amd_comgr.h" -#include "comgr-env.h" #include "comgr-hotswap-internal.h" #include "comgr.h" @@ -125,33 +124,65 @@ static bool shouldRunB0A0Patches(const ParsedHotswapIsa &Source, return SourceIsB0 && !TargetIsB0; } -} // namespace +static amd_comgr_status_t validateHotswapRewriteOptions( + const amd_comgr_hotswap_rewrite_options_t *RewriteOptions, + uint64_t &RewriteFlags) { + if (!RewriteOptions) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite_with_options: rewrite " + "options are required\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (RewriteOptions->size < sizeof(amd_comgr_hotswap_rewrite_options_t)) { + hotswap::log() + << "hotswap: error: amd_comgr_hotswap_rewrite_with_options: rewrite " + "options size " + << RewriteOptions->size << " is smaller than " + << sizeof(amd_comgr_hotswap_rewrite_options_t) << "\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + static constexpr uint64_t SupportedFlags = + AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES; + if (RewriteOptions->flags & ~SupportedFlags) { + hotswap::log() << "hotswap: error: amd_comgr_hotswap_rewrite_with_options: " + "unsupported rewrite option flags 0x"; + hotswap::log().write_hex(RewriteOptions->flags & ~SupportedFlags); + hotswap::log() << "\n"; + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + } + + RewriteFlags = RewriteOptions->flags; + return AMD_COMGR_STATUS_SUCCESS; +} + +static amd_comgr_status_t +hotswapRewrite(amd_comgr_data_t input, const char *source_isa_name, + const char *target_isa_name, uint64_t RewriteFlags, + const char *ApiName, amd_comgr_data_t *output) { + const bool RunEntryTrampolines = + RewriteFlags & AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES; -amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite( - amd_comgr_data_t input, const char *source_isa_name, - const char *target_isa_name, amd_comgr_data_t *output) { DataObject *InputP = DataObject::convert(input); if (!InputP) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: invalid input data " - "handle\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": invalid input data handle\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } if (!InputP->Data) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: input data is null\n"; + hotswap::log() << "hotswap: error: " << ApiName << ": input data is null\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } if (InputP->DataKind != AMD_COMGR_DATA_KIND_EXECUTABLE) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: input data kind must " - "be executable\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": input data kind must be executable\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } if (!source_isa_name || !target_isa_name || !output) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: source ISA, target " - "ISA, and output handle are required\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": source ISA, target ISA, and output handle are " + "required\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } @@ -162,26 +193,24 @@ amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite( if (!isGfx12_5Processor(SourceIdent.Ident.Processor) || !isGfx12_5Processor(TargetIdent.Ident.Processor)) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: only gfx125x " - "processors are supported, got source '" - << SourceIdent.Ident.Processor << "' and target '" - << TargetIdent.Ident.Processor << "'\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": only gfx125x processors are supported, got source '" + << SourceIdent.Ident.Processor << "' and target '" + << TargetIdent.Ident.Processor << "'\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } if (SourceIdent.Ident.Processor != TargetIdent.Ident.Processor) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: processor retargeting " - "is not supported, got source '" - << SourceIdent.Ident.Processor << "' and target '" - << TargetIdent.Ident.Processor << "'\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": processor retargeting is not supported, got source '" + << SourceIdent.Ident.Processor << "' and target '" + << TargetIdent.Ident.Processor << "'\n"; return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; } hotswap::Gfx1250RewriteOptions Options; Options.RunB0A0Patches = SourceIdent.Ident.Processor == "gfx1250" && shouldRunB0A0Patches(SourceIdent, TargetIdent); - Options.RunEntryTrampolines = env::shouldUseHotswapEntryTrampolines(); + Options.RunEntryTrampolines = RunEntryTrampolines; std::unique_ptr OutBuffer; amd_comgr_status_t Status = hotswap::retargetCodeObject( @@ -189,24 +218,22 @@ amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite( if (Status != AMD_COMGR_STATUS_SUCCESS) return Status; if (!OutBuffer) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: rewrite returned no " - "output buffer\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": rewrite returned no output buffer\n"; return AMD_COMGR_STATUS_ERROR; } DataObject *OutputP = DataObject::allocate(AMD_COMGR_DATA_KIND_EXECUTABLE); if (!OutputP) { - hotswap::log() << "hotswap: error: amd_comgr_hotswap_rewrite: output data " - "allocation failed\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": output data allocation failed\n"; return AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES; } if (amd_comgr_status_t SetStatus = OutputP->setData(std::move(OutBuffer))) { - hotswap::log() - << "hotswap: error: amd_comgr_hotswap_rewrite: output setData " - "failed with status " - << SetStatus << "\n"; + hotswap::log() << "hotswap: error: " << ApiName + << ": output setData failed with status " << SetStatus + << "\n"; OutputP->release(); return SetStatus; } @@ -214,3 +241,27 @@ amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite( *output = DataObject::convert(OutputP); return AMD_COMGR_STATUS_SUCCESS; } + +} // namespace + +amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite( + amd_comgr_data_t input, const char *source_isa_name, + const char *target_isa_name, amd_comgr_data_t *output) { + return hotswapRewrite(input, source_isa_name, target_isa_name, + AMD_COMGR_HOTSWAP_REWRITE_FLAG_NONE, + "amd_comgr_hotswap_rewrite", output); +} + +amd_comgr_status_t AMD_COMGR_API amd_comgr_hotswap_rewrite_with_options( + amd_comgr_data_t input, const char *source_isa_name, + const char *target_isa_name, + const amd_comgr_hotswap_rewrite_options_t *rewrite_options, + amd_comgr_data_t *output) { + uint64_t RewriteFlags = AMD_COMGR_HOTSWAP_REWRITE_FLAG_NONE; + if (amd_comgr_status_t Status = + validateHotswapRewriteOptions(rewrite_options, RewriteFlags)) + return Status; + + return hotswapRewrite(input, source_isa_name, target_isa_name, RewriteFlags, + "amd_comgr_hotswap_rewrite_with_options", output); +} diff --git a/amd/comgr/src/exportmap.in b/amd/comgr/src/exportmap.in index baac4815cb087..db7775ed72eaf 100644 --- a/amd/comgr/src/exportmap.in +++ b/amd/comgr/src/exportmap.in @@ -104,4 +104,5 @@ global: amd_comgr_hotswap_rewrite; global: amd_comgr_action_info_set_block_sizes; amd_comgr_action_info_get_block_sizes_count; amd_comgr_action_info_get_block_sizes; + amd_comgr_hotswap_rewrite_with_options; } @amd_comgr_NAME@_3.2; diff --git a/amd/comgr/src/hotswap/README.md b/amd/comgr/src/hotswap/README.md index 2a2eccec20105..852a46a642b1c 100644 --- a/amd/comgr/src/hotswap/README.md +++ b/amd/comgr/src/hotswap/README.md @@ -7,23 +7,23 @@ rewrite applied. The input code object is not modified. This directory contains COMGR's hotswap transpiler scaffolding, the raiser-based path for heavier cross-ISA transformations. The same-family stepping patches and -optional entry trampolines are implemented in the surrounding COMGR source files -and are exposed through `amd_comgr_hotswap_rewrite`. +entry trampolines are implemented in the surrounding COMGR source files and are +exposed through `amd_comgr_hotswap_rewrite_with_options`. ## Supported transformations | Transformation | Status | | -------------- | ------ | | gfx1250 B0 to A0 | Supported | -| gfx125x entry trampolines | Supported, on by default | +| gfx125x entry trampolines | Supported, opt-in | | gfx950 | Coming soon | | gfx942 | Coming soon | -## Environment variables +## Rewrite options -| Variable | Effect | -| -------- | ------ | -| `AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES` | Set to a nonzero value to enable gfx125x kernel descriptor entry redirection through COMGR-generated entry stubs, independent of A0/B0 stepping. Off by default. | +Callers request optional gfx125x kernel descriptor entry redirection through +`amd_comgr_hotswap_rewrite_with_options` with +`AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES`. ## Transpiler (cross-gen) From 80cbb2836313a0574b85bb9e38457c8a75212f50 Mon Sep 17 00:00:00 2001 From: Harsh Menon Date: Tue, 30 Jun 2026 08:41:25 -0700 Subject: [PATCH 3/4] AMDGPU: test hotswap rewrite options API --- .../test-lit/comgr-sources/hotswap-rewrite.c | 53 +++++++++++++++++-- .../hotswap-kernel-entry-trampoline.s | 16 ++++-- amd/comgr/test-lit/hotswap-rewrite.c | 17 ++++-- 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c b/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c index 01cb19fad38d3..cc460b70ffbe9 100644 --- a/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c +++ b/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c @@ -8,14 +8,43 @@ /// /// \file /// Canonical hotswap input/output driver for lit tests. Loads an ELF, runs -/// amd_comgr_hotswap_rewrite, and optionally dumps the output and/or checks -/// that a second rewrite produces identical output (idempotency). +/// the hotswap rewrite API, and optionally dumps the output and/or checks that +/// a second rewrite produces identical output (idempotency). /// //===----------------------------------------------------------------------===// #include "amd_comgr.h" #include "common.h" +enum RewriteOptionMode { + RewriteDefault, + RewriteEntryTrampolines, + RewriteBadOptionsSize, + RewriteBadOptionsFlags, +}; + +static amd_comgr_status_t runRewrite(amd_comgr_data_t InputData, + const char *SourceISA, + const char *TargetISA, + enum RewriteOptionMode OptionMode, + amd_comgr_data_t *OutputData) { + if (OptionMode == RewriteDefault) + return amd_comgr_hotswap_rewrite(InputData, SourceISA, TargetISA, + OutputData); + + amd_comgr_hotswap_rewrite_options_t Options = { + sizeof(amd_comgr_hotswap_rewrite_options_t), 0}; + if (OptionMode == RewriteEntryTrampolines) + Options.flags = AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES; + else if (OptionMode == RewriteBadOptionsSize) + Options.size = 0; + else if (OptionMode == RewriteBadOptionsFlags) + Options.flags = 0x2; + + return amd_comgr_hotswap_rewrite_with_options(InputData, SourceISA, TargetISA, + &Options, OutputData); +} + int main(int argc, char *argv[]) { if (argc < 2) { amd_comgr_data_t dummy_output; @@ -24,12 +53,19 @@ int main(int argc, char *argv[]) { amd_comgr_hotswap_rewrite(dummy_input, NULL, NULL, &dummy_output); if (Status != AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) fail("rewrite with NULL args: expected INVALID_ARGUMENT"); + + Status = amd_comgr_hotswap_rewrite_with_options(dummy_input, NULL, NULL, + NULL, &dummy_output); + if (Status != AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT) + fail("rewrite with NULL options: expected INVALID_ARGUMENT"); + printf("NULL_ARGS: INVALID_ARGUMENT\n"); return 0; } if (argc < 4) fail("usage: hotswap-rewrite " + "[--entry-trampolines] [--bad-options-size] [--bad-options-flags] " "[--zero-size] [--output ] [--dump ] " "[--check-idempotent] [--expect-status ]"); @@ -41,10 +77,17 @@ int main(int argc, char *argv[]) { const char *DumpFile = NULL; const char *ExpectStatus = NULL; int CheckIdempotent = 0; + enum RewriteOptionMode OptionMode = RewriteDefault; for (int I = 4; I < argc; ++I) { if (strcmp(argv[I], "--zero-size") == 0) ZeroSize = 1; + else if (strcmp(argv[I], "--entry-trampolines") == 0) + OptionMode = RewriteEntryTrampolines; + else if (strcmp(argv[I], "--bad-options-size") == 0) + OptionMode = RewriteBadOptionsSize; + else if (strcmp(argv[I], "--bad-options-flags") == 0) + OptionMode = RewriteBadOptionsFlags; else if (strcmp(argv[I], "--output") == 0 && I + 1 < argc) OutputPath = argv[++I]; else if (strcmp(argv[I], "--dump") == 0 && I + 1 < argc) @@ -70,7 +113,7 @@ int main(int argc, char *argv[]) { amd_comgr_data_t OutputData; amd_comgr_status_t Status = - amd_comgr_hotswap_rewrite(InputData, SourceISA, TargetISA, &OutputData); + runRewrite(InputData, SourceISA, TargetISA, OptionMode, &OutputData); const char *StatusString; amd_comgr_(status_string(Status, &StatusString)); @@ -110,8 +153,8 @@ int main(int argc, char *argv[]) { if (CheckIdempotent) { amd_comgr_data_t Output2Data; - Status = amd_comgr_hotswap_rewrite(OutputData, SourceISA, TargetISA, - &Output2Data); + Status = runRewrite(OutputData, SourceISA, TargetISA, OptionMode, + &Output2Data); if (Status != AMD_COMGR_STATUS_SUCCESS) fail("idempotent rewrite failed with status %d", (int)Status); diff --git a/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s b/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s index 7df45acd8ca9d..7867a1bd16a77 100644 --- a/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s +++ b/amd/comgr/test-lit/hotswap-kernel-entry-trampoline.s @@ -13,9 +13,9 @@ // NO-TRAMP: s_endpgm // NO-TRAMP-NOT: global_wb -// RUN: AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES=1 hotswap-rewrite %t.elf \ +// RUN: hotswap-rewrite %t.elf \ // RUN: amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ -// RUN: --output %t.out.elf \ +// RUN: --entry-trampolines --output %t.out.elf \ // RUN: | %FileCheck --check-prefix=API %s // API: RESULT: SUCCESS @@ -34,13 +34,21 @@ // METADATA: .name: entry_tramp_kernel // METADATA: .sgpr_count: 10 +// RUN: hotswap-rewrite %t.out.elf \ +// RUN: amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --entry-trampolines --output %t.out2.elf \ +// RUN: | %FileCheck --check-prefix=API2 %s +// API2: RESULT: SUCCESS +// RUN: cmp %t.out.elf %t.out2.elf + // COM: If the requested entry trampoline cannot allocate an aligned scratch // COM: SGPR pair, the rewrite fails instead of returning a partial output. // RUN: sed 's/.sgpr_count: 8/.sgpr_count: 105/' %s > %t.highsgpr.s // RUN: %clang -target amdgcn-amd-amdhsa -mcpu=gfx1250 -nostdlib \ // RUN: %t.highsgpr.s -o %t.highsgpr.elf -// RUN: AMD_COMGR_HOTSWAP_ENTRY_TRAMPOLINES=1 hotswap-rewrite %t.highsgpr.elf amdgcn-amd-amdhsa--gfx1250 \ -// RUN: amdgcn-amd-amdhsa--gfx1250 --expect-status ERROR \ +// RUN: hotswap-rewrite %t.highsgpr.elf \ +// RUN: amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --entry-trampolines --expect-status ERROR \ // RUN: | %FileCheck --check-prefix=NO-SCRATCH %s // NO-SCRATCH: RESULT: ERROR diff --git a/amd/comgr/test-lit/hotswap-rewrite.c b/amd/comgr/test-lit/hotswap-rewrite.c index 6cde81f732e48..f6f963e773a95 100644 --- a/amd/comgr/test-lit/hotswap-rewrite.c +++ b/amd/comgr/test-lit/hotswap-rewrite.c @@ -7,6 +7,15 @@ // RUN: hotswap-rewrite | %FileCheck --check-prefix=NULL %s // NULL: NULL_ARGS: INVALID_ARGUMENT +// COM: Options API validation +// RUN: hotswap-rewrite %t.elf amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --bad-options-size --expect-status INVALID_ARGUMENT \ +// RUN: | %FileCheck --check-prefix=BADOPTIONS %s +// RUN: hotswap-rewrite %t.elf amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ +// RUN: --bad-options-flags --expect-status INVALID_ARGUMENT \ +// RUN: | %FileCheck --check-prefix=BADOPTIONS %s +// BADOPTIONS: RESULT: INVALID_ARGUMENT + // COM: Unsupported ISA pair // RUN: hotswap-rewrite %t.elf amdgcn-amd-amdhsa--gfx942 amdgcn-amd-amdhsa--gfx942 \ // RUN: | %FileCheck --check-prefix=INVALID %s @@ -23,14 +32,12 @@ // ZEROSIZE: RESULT: INVALID_ARGUMENT // COM: Supported GFX1250 pair on a malformed ELF (no .text section). -// COM: retargetCodeObjectB0A0 rejects inputs that fail ELF64 parsing or have +// COM: retargetCodeObject rejects inputs that fail ELF64 parsing or have // COM: an empty .text section with INVALID_ARGUMENT -- returning SUCCESS with // COM: an unchanged copy there would silently hide caller-side bugs. // RUN: hotswap-rewrite %t.elf amdgcn-amd-amdhsa--gfx1250 amdgcn-amd-amdhsa--gfx1250 \ // RUN: | %FileCheck --check-prefix=MALFORMED %s // MALFORMED: RESULT: INVALID_ARGUMENT -// COM: End-to-end coverage on a real gfx1250 code object (compiled via clang -// COM: --offload-arch=gfx1250, verified with llvm-readelf + llvm-objdump) is -// COM: tracked as a follow-up once the gfx1250 kernel-compile driver is wired -// COM: into the test-lit infrastructure. +// COM: End-to-end coverage on real gfx1250 code objects is covered by +// COM: hotswap-rewrite-e2e.hip and hotswap-kernel-entry-trampoline.s. From b64f8f51a1649f73f74f9b85f524418d6727c177 Mon Sep 17 00:00:00 2001 From: Harsh Menon Date: Tue, 30 Jun 2026 11:49:43 -0700 Subject: [PATCH 4/4] AMDGPU: document hotswap no-op behavior --- amd/comgr/include/amd_comgr.h.in | 10 ++++-- amd/comgr/src/hotswap/README.md | 5 +++ .../test-lit/comgr-sources/hotswap-rewrite.c | 4 +-- amd/comgr/test-unit/HotswapMCTest.cpp | 34 ++++++++++--------- 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/amd/comgr/include/amd_comgr.h.in b/amd/comgr/include/amd_comgr.h.in index 63cfd4c248abe..2bfeca80c8cf2 100644 --- a/amd/comgr/include/amd_comgr.h.in +++ b/amd/comgr/include/amd_comgr.h.in @@ -2795,7 +2795,10 @@ amd_comgr_map_elf_virtual_address_to_code_object_offset( * patches (e.g. B0 to A0) and cross-family transpilation. * The input ELF is not modified; a new data object is created and returned. * - * If no patches are needed, the output is a copy of the input. + * A successful call means COMGR produced a valid output code object, not + * necessarily that the output bytes differ from the input. If the + * source/target ISA pair selects no enabled transformation, the output is a + * copy of the input. * * Currently supported transformations: * - GFX1250 B0 to A0 @@ -2870,7 +2873,10 @@ typedef struct amd_comgr_hotswap_rewrite_options_s { * patches (e.g. B0 to A0) and cross-family transpilation. * The input ELF is not modified; a new data object is created and returned. * - * If no patches are needed, the output is a copy of the input. + * A successful call means COMGR produced a valid output code object, not + * necessarily that the output bytes differ from the input. If the + * source/target ISA pair and rewrite options select no enabled transformation, + * the output is a copy of the input. * * Currently supported transformations: * - GFX1250 B0 to A0 diff --git a/amd/comgr/src/hotswap/README.md b/amd/comgr/src/hotswap/README.md index 852a46a642b1c..bf0dd44c14169 100644 --- a/amd/comgr/src/hotswap/README.md +++ b/amd/comgr/src/hotswap/README.md @@ -25,6 +25,11 @@ Callers request optional gfx125x kernel descriptor entry redirection through `amd_comgr_hotswap_rewrite_with_options` with `AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES`. +`AMD_COMGR_STATUS_SUCCESS` means COMGR produced a valid output code object, not +necessarily that the output bytes changed. If the source/target ISA pair and +rewrite options select no enabled transformation, the output is a copy of the +input. + ## Transpiler (cross-gen) The transpiler is the heavier sibling to the byte-level rewrite. It raises diff --git a/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c b/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c index cc460b70ffbe9..59791d2dbf03b 100644 --- a/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c +++ b/amd/comgr/test-lit/comgr-sources/hotswap-rewrite.c @@ -8,8 +8,8 @@ /// /// \file /// Canonical hotswap input/output driver for lit tests. Loads an ELF, runs -/// the hotswap rewrite API, and optionally dumps the output and/or checks that -/// a second rewrite produces identical output (idempotency). +/// the hotswap rewrite API, and optionally dumps the output and/or reruns the +/// same request and compares the two outputs. /// //===----------------------------------------------------------------------===// diff --git a/amd/comgr/test-unit/HotswapMCTest.cpp b/amd/comgr/test-unit/HotswapMCTest.cpp index 60029d3aad360..38515ab66d1ff 100644 --- a/amd/comgr/test-unit/HotswapMCTest.cpp +++ b/amd/comgr/test-unit/HotswapMCTest.cpp @@ -274,6 +274,18 @@ static void expectInstMatchesAsm(const llvm::MCInst &Actual, expectSameOperands(Actual, Expected, Asm); } +static bool appendSingleInstBytes(llvm::SmallVectorImpl &Bytes, + llvm::StringRef Asm, + const LLVMState &S) { + llvm::SmallVector Inst = assembleSingleInst(Asm, S); + if (Inst.empty()) { + ADD_FAILURE() << "failed to assemble: " << Asm.str(); + return false; + } + Bytes.append(Inst.begin(), Inst.end()); + return true; +} + TEST(CheckVgprOverlap, DetectsDirectOverlap) { LLVMState S = initLLVM(makeGfx1250Ident()); ASSERT_TRUE(S.Valid); @@ -399,22 +411,12 @@ TEST(BuildKernelEntryTrampoline, MatcherRejectsWrongOperandShape) { ASSERT_TRUE(S.Valid); llvm::SmallVector Bytes; - auto Append = [&](llvm::StringRef Asm) { - llvm::SmallVector Inst = assembleSingleInst(Asm, S); - if (Inst.empty()) { - ADD_FAILURE() << "failed to assemble: " << Asm.str(); - return false; - } - Bytes.append(Inst.begin(), Inst.end()); - return true; - }; - - ASSERT_TRUE(Append("global_wb")); - ASSERT_TRUE(Append("v_nop")); - ASSERT_TRUE(Append("s_get_pc_i64 s[8:9]")); - ASSERT_TRUE(Append("s_add_u32 s8, s8, 0")); - ASSERT_TRUE(Append("s_addc_u32 s10, s10, 0")); - ASSERT_TRUE(Append("s_set_pc_i64 s[8:9]")); + ASSERT_TRUE(appendSingleInstBytes(Bytes, "global_wb", S)); + ASSERT_TRUE(appendSingleInstBytes(Bytes, "v_nop", S)); + ASSERT_TRUE(appendSingleInstBytes(Bytes, "s_get_pc_i64 s[8:9]", S)); + ASSERT_TRUE(appendSingleInstBytes(Bytes, "s_add_u32 s8, s8, 0", S)); + ASSERT_TRUE(appendSingleInstBytes(Bytes, "s_addc_u32 s10, s10, 0", S)); + ASSERT_TRUE(appendSingleInstBytes(Bytes, "s_set_pc_i64 s[8:9]", S)); llvm::SmallVector CodeEnd = assembleSingleInst("s_code_end", S); ASSERT_EQ(CodeEnd.size(), MinInstSize);