Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions amd/comgr/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ set(SOURCES
src/comgr-env.cpp
src/comgr-hotswap.cpp
src/comgr-hotswap-b0a0.cpp
src/comgr-hotswap-entry-trampoline.cpp
src/comgr-hotswap-patch-trampoline.cpp
src/comgr-hotswap-elf.cpp
src/comgr-hotswap-llvm.cpp
Expand Down
90 changes: 89 additions & 1 deletion amd/comgr/include/amd_comgr.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -2795,7 +2795,10 @@ amd_comgr_map_elf_virtual_address_to_code_object_offset(
* patches (e.g. B0 to A0) and cross-family transpilation.
* The input ELF is not modified; a new data object is created and returned.
*
* If no patches are needed, the output is a copy of the input.
* A successful call means COMGR produced a valid output code object, not
* necessarily that the output bytes differ from the input. If the
* source/target ISA pair selects no enabled transformation, the output is a
* copy of the input.
*
* Currently supported transformations:
* - GFX1250 B0 to A0
Expand Down Expand Up @@ -2833,6 +2836,91 @@ amd_comgr_hotswap_rewrite(
const char *target_isa_name,
amd_comgr_data_t *output) AMD_COMGR_VERSION_3_2;

/**
* @brief HotSwap rewrite option flags.
*
* The enumerators are bit values. They are combined with bitwise OR and
* passed in the @c flags member of @p amd_comgr_hotswap_rewrite_options_t.
*/
typedef enum amd_comgr_hotswap_rewrite_flag_s {
/**
* Apply the default rewrite behavior; no optional rewrite flag is set.
*/
AMD_COMGR_HOTSWAP_REWRITE_FLAG_NONE = 0,
/**
* Redirect kernel descriptors through generated entry stubs.
*/
AMD_COMGR_HOTSWAP_REWRITE_FLAG_ENTRY_TRAMPOLINES = 0x1,
} amd_comgr_hotswap_rewrite_flag_t;

/**
* @brief Options for @p amd_comgr_hotswap_rewrite_with_options.
*/
typedef struct amd_comgr_hotswap_rewrite_options_s {
/**
* Size of this structure, in bytes. Must be at least
* sizeof(amd_comgr_hotswap_rewrite_options_t); a structure that is too
* small is rejected with @c AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT.
*/
size_t size;
/**
* Bitwise OR of @p amd_comgr_hotswap_rewrite_flag_t values. Unknown or
* unsupported flag bits are rejected with
* @c AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT.
*/
uint64_t flags;
} amd_comgr_hotswap_rewrite_options_t;

/**
* @brief Rewrite a code object from one ISA to another with explicit options.
*
* Rewrites GPU instructions in the ELF code object so that it can execute
* on a different target ISA. This includes both same-family stepping
* patches (e.g. B0 to A0) and cross-family transpilation.
* The input ELF is not modified; a new data object is created and returned.
*
* A successful call means COMGR produced a valid output code object, not
* necessarily that the output bytes differ from the input. If the
* source/target ISA pair and rewrite options select no enabled transformation,
* the output is a copy of the input.
*
* Currently supported transformations:
* - GFX1250 B0 to A0
* - GFX125x entry trampolines when requested by @p rewrite_options
*
* Additional source/target ISA pairs may be added in future releases.
* Unsupported @p source_isa_name / @p target_isa_name combinations return
* @c AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT.
*
* @param[in] input A data object of kind @p AMD_COMGR_DATA_KIND_EXECUTABLE
* containing the input ELF code object bytes.
* @param[in] source_isa_name A null terminated string that is the isa name
* the code object was compiled for. The isa name is defined as the Code
* Object Target Identification string, described at
* https://llvm.org/docs/AMDGPUUsage.html#code-object-target-identification
* @param[in] target_isa_name A null terminated string that is the isa name
* of the target GPU.
* @param[in] rewrite_options Options controlling optional rewrite behavior.
* Must not be NULL. Unknown flag bits return
* @c AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT.
* @param[out] output A handle to a data object of kind @p
* AMD_COMGR_DATA_KIND_EXECUTABLE containing the rewritten ELF. The caller
* must release this handle using @c amd_comgr_release_data when done.
* @p output is not modified on failure.
*
* @retval ::AMD_COMGR_STATUS_SUCCESS The rewrite completed and @p output
* holds a valid code object, which may be a copy of the input when no
* enabled transformation was selected.
* @retval ::AMD_COMGR_STATUS_ERROR An internal error occurred.
* @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p input is an invalid
* data object, is not of kind @p AMD_COMGR_DATA_KIND_EXECUTABLE, does not
* contain data bytes, @p source_isa_name, @p target_isa_name, @p
* rewrite_options, or @p output is NULL, the source/target isa name
* combination is not supported, the options structure is too small, or
* unsupported option flags are set.
* @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES Unable to allocate
* the output data object.
*/
amd_comgr_status_t AMD_COMGR_API
amd_comgr_hotswap_rewrite_with_options(
amd_comgr_data_t input,
const char *source_isa_name,
const char *target_isa_name,
const amd_comgr_hotswap_rewrite_options_t *rewrite_options,
amd_comgr_data_t *output) AMD_COMGR_VERSION_3_3;

/** @} */

#ifdef __cplusplus
Expand Down
118 changes: 83 additions & 35 deletions amd/comgr/src/comgr-hotswap-b0a0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
///
/// \file
/// Dispatcher for B0-to-A0 silicon stepping patches and the
/// retargetCodeObjectB0A0 orchestrator that drives the full pipeline:
/// retargetCodeObject orchestrator that drives the full pipeline:
/// decode -> patch -> trampoline growth -> DWARF update.
///
/// Patch passes are dispatched through HotswapPatchVTable. The membership
Expand All @@ -32,6 +32,8 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Compiler.h"

#include <limits>

using namespace llvm;

namespace COMGR {
Expand Down Expand Up @@ -118,7 +120,7 @@ void patchDebugFrame(uint8_t *Elf, size_t ElfSize, uint64_t TextAddr,
// invokes it eagerly on the singleton's private storage, which the C++11
// magic-static rule guarantees runs exactly once even under concurrent
// first access. That removes both the explicit std::call_once at the
// retargetCodeObjectB0A0 entry point and any inter-TU static-init order
// retargetCodeObject entry point and any inter-TU static-init order
// dependency on the patch modules.

void installHotswapPatches(HotswapPatchVTable &VT) {
Expand Down Expand Up @@ -426,15 +428,15 @@ applyGfx1250B0toA0Rules(std::vector<InternalDecodedInst> &Decoded,
return Patched;
}

// -- retargetCodeObjectB0A0 helpers -------------------------------------------
// -- retargetCodeObject helpers -------------------------------------------

/// Finalize the deferred trampolines produced by emitToTrampoline: resolves
/// the branch-back at the tail of each trampoline to land on the next
/// instruction after the original site, writes the branch-forward + s_nop
/// padding at the original .text slot, and reports per-trampoline encoding
/// failures through log(). Runs after all patch passes finish so the
/// post-.text layout of trampolines is known. Returns false if any
/// trampoline could not be fixed up, but still patches the ones that can.
/// trampoline could not be fixed up.
[[nodiscard]] static bool
fixupTrampolineBranches(std::vector<Trampoline> &Trampolines, uint8_t *Text,
uint64_t TextSize, const LLVMState &LS) {
Expand Down Expand Up @@ -480,15 +482,15 @@ fixupTrampolineBranches(std::vector<Trampoline> &Trampolines, uint8_t *Text,
/// implementations land in separate PRs.
static void patchDebugSections(WritableMemoryBuffer &ElfBuf,
ArrayRef<Trampoline> Trampolines,
const ElfView &Elf, size_t TrampTotal) {
const ElfView &Elf, size_t GrowthTotal) {
uint8_t *Data = reinterpret_cast<uint8_t *>(ElfBuf.getBufferStart());
size_t Size = ElfBuf.getBufferSize();
if (!addTrampolineSymbols(ElfBuf, Trampolines, Elf.textSize(),
Elf.textSectionIndex()))
log() << "hotswap: error: addTrampolineSymbols failed\n";
patchDebugRanges(Data, Size, Elf.textAddr(), Elf.textSize(), TrampTotal);
patchDebugInfo(Data, Size, Elf.textAddr(), Elf.textSize(), TrampTotal);
patchDebugFrame(Data, Size, Elf.textAddr(), Elf.textSize(), TrampTotal);
patchDebugRanges(Data, Size, Elf.textAddr(), Elf.textSize(), GrowthTotal);
patchDebugInfo(Data, Size, Elf.textAddr(), Elf.textSize(), GrowthTotal);
patchDebugFrame(Data, Size, Elf.textAddr(), Elf.textSize(), GrowthTotal);
if (!patchDebugLine(ElfBuf, Trampolines, Elf.textSize(), Elf.textAddr()))
log() << "hotswap: error: patchDebugLine failed\n";
}
Expand Down Expand Up @@ -520,79 +522,125 @@ static void runScratchVerification(WritableMemoryBuffer &OutBuf,
<< "scratch conflicts\n";
}

// -- retargetCodeObjectB0A0 ---------------------------------------------------
// -- retargetCodeObject -------------------------------------------------------

amd_comgr_status_t retargetCodeObjectB0A0(const void *ElfData, size_t ElfSize,
const TargetIdentifier &TargetIdent,
std::unique_ptr<MemoryBuffer> &Out) {
amd_comgr_status_t retargetCodeObject(const void *ElfData, size_t ElfSize,
const TargetIdentifier &TargetIdent,
const Gfx1250RewriteOptions &Options,
std::unique_ptr<MemoryBuffer> &Out) {
// The dispatcher fetches the patch vtable lazily via
// getHotswapPatchVTable() inside applyGfx1250B0toA0Rules; the singleton's
// initializer binds every register*Patch slot on first access, so no
// explicit install step is needed here.

if (!Options.RunB0A0Patches && !Options.RunEntryTrampolines) {
std::unique_ptr<WritableMemoryBuffer> Result =
WritableMemoryBuffer::getNewUninitMemBuffer(ElfSize);
if (!Result) {
log() << "hotswap: error: retargetCodeObject: "
<< "getNewUninitMemBuffer(" << ElfSize
<< ") failed (out of memory) for the no-op output copy.\n";
return AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES;
}
std::memcpy(Result->getBufferStart(), ElfData, ElfSize);
Out = std::move(Result);
return AMD_COMGR_STATUS_SUCCESS;
}
Comment on lines +536 to +548

@chinmaydd chinmaydd Jun 30, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're almost there.

This no-op behavior should be documented somewhere

@chinmaydd chinmaydd Jun 30, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's also weird that there's no way for the caller to know whether this was exercised.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this is not ideal, but I would prefer not to grow the API shape in this PR. For now, callers that need to know can compare input/output bytes. If ROCr needs structured pass-result reporting, we should add that as a follow-up API extension.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I documented this in the public hotswap API comments and the HotSwap README. SUCCESS means comgr produced a valid output code object, not that bytes necessarily changed; if source/target/options select no enabled rewrite, output is a copy of the input.


// Take a working copy so the input is preserved and we have a mutable
// buffer to parse / patch.
std::vector<uint8_t> Buf(static_cast<const uint8_t *>(ElfData),
static_cast<const uint8_t *>(ElfData) + ElfSize);

Expected<ElfView> ViewOrErr = ElfView::create(Buf.data(), Buf.size());
if (!ViewOrErr) {
log() << "hotswap: error: retargetCodeObjectB0A0: input is not a "
log() << "hotswap: error: retargetCodeObject: input is not a "
<< "parseable ELF64 (" << toString(ViewOrErr.takeError()) << ").\n";
return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT;
}
if (ViewOrErr->textSize() == 0) {
log() << "hotswap: error: retargetCodeObjectB0A0: input ELF has empty "
log() << "hotswap: error: retargetCodeObject: input ELF has empty "
<< ".text section; nothing to rewrite.\n";
return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT;
}
ElfView &Elf = *ViewOrErr;

LLVMState LS = initLLVM(TargetIdent);
if (!LS.Valid) {
log() << "hotswap: error: retargetCodeObjectB0A0: initLLVM failed "
log() << "hotswap: error: retargetCodeObject: initLLVM failed "
<< "for CPU '" << TargetIdent.Processor << "'; aborting rewrite.\n";
return AMD_COMGR_STATUS_ERROR;
}

RewriteConfig Config = makeGfx1250B0A0Config();

uint8_t *Text = Elf.textData();
std::vector<InternalDecodedInst> Decoded;
if (!decodeTextSection(Text, Elf.textSize(), LS, Decoded)) {
log() << "hotswap: error: retargetCodeObjectB0A0: decodeTextSection "
<< "failed on .text (" << Elf.textSize() << " bytes).\n";
return AMD_COMGR_STATUS_ERROR;
}

uint64_t Count = 0;
std::vector<Trampoline> Deferred;
std::vector<ScratchPatchInfo> ScratchPatches;
uint32_t Count = applyGfx1250B0toA0Rules(
Decoded, Text, Elf.textSize(), LS, Deferred, Elf, ScratchPatches, Config);
if (Options.RunB0A0Patches) {
std::vector<InternalDecodedInst> Decoded;
if (!decodeTextSection(Text, Elf.textSize(), LS, Decoded)) {
log() << "hotswap: error: retargetCodeObject: decodeTextSection "
<< "failed on .text (" << Elf.textSize() << " bytes).\n";
return AMD_COMGR_STATUS_ERROR;
}

log() << "hotswap: applied " << Count << " patches\n";
Count = applyGfx1250B0toA0Rules(Decoded, Text, Elf.textSize(), LS, Deferred,
Elf, ScratchPatches, Config);
log() << "hotswap: applied " << Count << " B0-to-A0 patches\n";
} else {
log() << "hotswap: B0-to-A0 patches disabled for this rewrite\n";
}

std::unique_ptr<WritableMemoryBuffer> Result;
std::vector<Trampoline> Growth = Deferred;
if (!Deferred.empty()) {
if (!fixupTrampolineBranches(Deferred, Text, Elf.textSize(), LS))
log() << "hotswap: error: some trampolines could not be fixed up\n";
if (!fixupTrampolineBranches(Deferred, Text, Elf.textSize(), LS)) {
log() << "hotswap: error: trampoline branch fixup failed; aborting "
"rewrite\n";
return AMD_COMGR_STATUS_ERROR;
}
Growth = Deferred;
}

std::vector<KernelEntryTrampolineFixup> EntryFixups;
if (Options.RunEntryTrampolines) {
std::optional<uint32_t> EntryCount = appendKernelEntryTrampolines(
Elf, LS, Config.MaxSgprs, Growth, EntryFixups);
if (!EntryCount)
return AMD_COMGR_STATUS_ERROR;
Count += *EntryCount;
} else {
log() << "hotswap: kernel-entry trampolines disabled for this rewrite\n";
}

Result = Elf.growWithTrampolines(Deferred, LS.SNopBytes);
if (!Growth.empty()) {
Result = Elf.growWithTrampolines(Growth, LS.SNopBytes);
if (!Result) {
log() << "hotswap: error: retargetCodeObjectB0A0: "
log() << "hotswap: error: retargetCodeObject: "
<< "ElfView::growWithTrampolines returned null with "
<< Deferred.size() << " trampolines queued.\n";
<< Growth.size() << " trampolines queued.\n";
return AMD_COMGR_STATUS_ERROR;
}

size_t TrampTotal = 0;
for (const Trampoline &T : Deferred)
TrampTotal += T.Bytes.size();
patchDebugSections(*Result, Deferred, Elf, TrampTotal);
size_t GrowthTotal = 0;
for (const Trampoline &T : Growth) {
if (T.Bytes.size() > std::numeric_limits<size_t>::max() - GrowthTotal) {
log() << "hotswap: error: retargetCodeObject: growth byte count "
<< "overflows size_t.\n";
return AMD_COMGR_STATUS_ERROR;
}
GrowthTotal += T.Bytes.size();
}
patchDebugSections(*Result, Deferred, Elf, GrowthTotal);
if (!rewriteKernelEntryDescriptorOffsets(*Result, Elf.textSize(),
EntryFixups))
return AMD_COMGR_STATUS_ERROR;
} else {
Result = WritableMemoryBuffer::getNewUninitMemBuffer(ElfSize);
if (!Result) {
log() << "hotswap: error: retargetCodeObjectB0A0: "
log() << "hotswap: error: retargetCodeObject: "
<< "getNewUninitMemBuffer(" << ElfSize
<< ") failed (out of memory) for the patched output copy.\n";
return AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES;
Expand Down
Loading
Loading