From 271b1d38e4a8b341d7aae46545e7260253251de4 Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Mon, 22 Jun 2026 18:18:17 +0000 Subject: [PATCH 1/9] hotswap: re-key HSA_TOOLS_LIB allowlist to libhsa-hotswap.so The ROCr v1 tool-load carve-out (runtime.cpp) and the CLR hotswap enable-gate (clr/rocclr/device/hotswap.hpp) matched the old comgr-hosted tool name "libamd_comgr_hotswap_tool.so". After the HSA tool moved to rocm-systems' libhsa-hotswap.so, the name no longer matches: on a rocprofiler-register (v3) box allow_v1_registration stays false, so ROCr never invokes the tool's OnLoad and no rewrite is applied (silent). Re-key both constants to "libhsa-hotswap.so". --- projects/clr/rocclr/device/hotswap.hpp | 2 +- .../rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/rocclr/device/hotswap.hpp b/projects/clr/rocclr/device/hotswap.hpp index 0c26aa2781cc..bf9205195c75 100644 --- a/projects/clr/rocclr/device/hotswap.hpp +++ b/projects/clr/rocclr/device/hotswap.hpp @@ -27,7 +27,7 @@ namespace amd { namespace hotswap { // On when this tool is loaded via HSA_TOOLS_LIB (name must match ROCR LoadTools). -inline constexpr const char* kHotswapToolLib = "libamd_comgr_hotswap_tool.so"; +inline constexpr const char* kHotswapToolLib = "libhsa-hotswap.so"; inline bool Enabled() { const char* tools_lib = std::getenv("HSA_TOOLS_LIB"); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 3194c83785e2..4343b1693b95 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -2793,12 +2793,12 @@ void Runtime::LoadTools() { getpid(), rocp_reg_status, rocprofiler_register_error_string(rocp_reg_status)); } - // rocprofiler-register (v3) provides tracing only; the comgr hotswap tool must + // rocprofiler-register (v3) provides tracing only; the hotswap tool must // intercept and modify HSA calls (it wraps hsa_code_object_reader_create_from_memory), // which requires the v1 HSA_TOOLS_LIB path. Allow v1 registration specifically for // that first-party tool so it loads via HSA_TOOLS_LIB alone, without re-enabling v1 // for other tools. General v1 behavior is otherwise unchanged. - static constexpr const char* kHotswapToolLib = "libamd_comgr_hotswap_tool.so"; + static constexpr const char* kHotswapToolLib = "libhsa-hotswap.so"; bool allow_v1_registration = flag().tools_lib_names().find(kHotswapToolLib) != std::string::npos; if (os::IsEnvVarSet("HSA_TOOLS_ROCPROFILER_V1_TOOLS")) { From 130a49ef59ab3fdf2bed6057c506e805d4fcb7dc Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Mon, 22 Jun 2026 18:18:17 +0000 Subject: [PATCH 2/9] hotswap: include amd_comgr.h via its namespaced subdir Installed comgr ships its public header at include/amd_comgr/amd_comgr.h and its CMake package exposes the include root (-Iinclude), so the flat #include does not resolve when building against an installed comgr (the path TheRock builds the tool against). Use "amd_comgr/amd_comgr.h" to match every other comgr consumer in rocm-systems (clr comgrctx.hpp/devkernel.hpp/devprogram.hpp, rocdbgapi, rocprofiler-sdk). --- projects/hotswap/hotswap.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/hotswap/hotswap.cpp b/projects/hotswap/hotswap.cpp index 5267e3998fa6..dae28dca733f 100644 --- a/projects/hotswap/hotswap.cpp +++ b/projects/hotswap/hotswap.cpp @@ -5,7 +5,7 @@ //===----------------------------------------------------------------------===// #include "hotswap.hpp" -#include +#include "amd_comgr/amd_comgr.h" #include #include #include From 8fa66e4078352070929efed19318382f2580a467 Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Mon, 22 Jun 2026 23:19:25 +0000 Subject: [PATCH 3/9] hotswap: install libhsa-hotswap.so and link hsa-runtime64 Fold in the install rule and HSA runtime linkage previously provided by #7577 so this branch no longer depends on it: add GNUInstallDirs, find_package(hsa-runtime64), link hsa-runtime64::hsa-runtime64, and install(TARGETS hsa-hotswap). --- projects/hotswap/CMakeLists.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/projects/hotswap/CMakeLists.txt b/projects/hotswap/CMakeLists.txt index 6710d980717e..b9e0f3734b65 100644 --- a/projects/hotswap/CMakeLists.txt +++ b/projects/hotswap/CMakeLists.txt @@ -1,6 +1,8 @@ cmake_minimum_required(VERSION 3.16) project(hotswap LANGUAGES CXX) +include(GNUInstallDirs) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -10,6 +12,7 @@ set(HSA_RUNTIME_INC "${CMAKE_CURRENT_SOURCE_DIR}/../rocr-runtime/runtime/hsa-run # COMGR is required — provides amd_comgr_hotswap_rewrite. find_package(amd_comgr CONFIG REQUIRED) +find_package(hsa-runtime64 CONFIG REQUIRED) find_path(HSA_INCLUDE_DIR hsa.h PATHS ${HSA_RUNTIME_INC} NO_DEFAULT_PATH REQUIRED) @@ -36,10 +39,18 @@ target_include_directories(hsa-hotswap PRIVATE ${HSA_RUNTIME_INC}/.. ) -target_link_libraries(hsa-hotswap PRIVATE amd_comgr) +target_link_libraries(hsa-hotswap PRIVATE + amd_comgr + hsa-runtime64::hsa-runtime64 +) set_target_properties(hsa-hotswap PROPERTIES POSITION_INDEPENDENT_CODE ON) +install(TARGETS hsa-hotswap + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + # Test executable add_executable(hotswap_test tests/hotswap_test.cpp hotswap.cpp) target_include_directories(hotswap_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) From e6b00f256afc4596b8d1b909fae530246522f73d Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Tue, 23 Jun 2026 06:25:28 +0000 Subject: [PATCH 4/9] hotswap: derive code-object ISA via amd_comgr_get_data_isa_name The hand-rolled metadata-note parser (find_isa_in_metadata) over-read the msgpack-encoded ISA string, spilling past the gfx target into the next note key, so COMGR rejected the malformed source ISA (rc=2) and every rewrite fell back to the original code object. Read the source ISA from the code object via COMGR's amd_comgr_get_data_isa_name instead. It uses LLVM's canonical parser, so it is correct and tracks triple normalization automatically, and it removes the fragile hand-rolled ELF/note parsing. The target ISA stays the running GPU's ISA (from the HSA agent), so amd_comgr_hotswap_rewrite still receives source (code object) and target (GPU) as designed. Also drops the now-dead note-parsing helpers and updates the unit test. --- projects/hotswap/hotswap.cpp | 36 ++++- projects/hotswap/hotswap.hpp | 16 ++- projects/hotswap/hotswap_tool.cpp | 172 +----------------------- projects/hotswap/tests/hotswap_test.cpp | 91 ++++--------- 4 files changed, 68 insertions(+), 247 deletions(-) diff --git a/projects/hotswap/hotswap.cpp b/projects/hotswap/hotswap.cpp index dae28dca733f..a9a67b747713 100644 --- a/projects/hotswap/hotswap.cpp +++ b/projects/hotswap/hotswap.cpp @@ -10,12 +10,13 @@ #include #include #include +#include namespace rocr::hotswap { int RetargetCodeObject(const void *elf_data, size_t elf_size, - const char *source_isa, const char *target_isa, - void **out_data, size_t *out_size) { + const char *target_isa, void **out_data, + size_t *out_size) { using OwnedElf = std::unique_ptr; if (!out_data || !out_size) { @@ -26,7 +27,7 @@ int RetargetCodeObject(const void *elf_data, size_t elf_size, *out_data = const_cast(elf_data); *out_size = elf_size; - if (!elf_data || elf_size == 0 || !source_isa || !target_isa) { + if (!elf_data || elf_size == 0 || !target_isa) { fprintf(stderr, "hotswap: invalid null input argument(s)\n"); return -1; } @@ -47,14 +48,37 @@ int RetargetCodeObject(const void *elf_data, size_t elf_size, return static_cast(status); } - // Call the hotswap rewrite API. + // Source ISA from the code object via COMGR (LLVM-canonical, tracks triple + // normalization); the target ISA is the running GPU, supplied by the caller. + size_t isa_len = 0; + status = amd_comgr_get_data_isa_name(input, &isa_len, nullptr); + if (status != AMD_COMGR_STATUS_SUCCESS || isa_len == 0) { + fprintf(stderr, "hotswap: failed to query COMGR ISA name (rc=%d)\n", + static_cast(status)); + amd_comgr_release_data(input); + return static_cast(status); + } + std::string source_isa(isa_len, '\0'); + status = amd_comgr_get_data_isa_name(input, &isa_len, source_isa.data()); + if (status != AMD_COMGR_STATUS_SUCCESS) { + fprintf(stderr, "hotswap: failed to read COMGR ISA name (rc=%d)\n", + static_cast(status)); + amd_comgr_release_data(input); + return static_cast(status); + } + // Reported size includes the terminating NUL. + if (!source_isa.empty() && source_isa.back() == '\0') { + source_isa.pop_back(); + } + amd_comgr_data_t output = {0}; - status = amd_comgr_hotswap_rewrite(input, source_isa, target_isa, &output); + status = amd_comgr_hotswap_rewrite(input, source_isa.c_str(), target_isa, + &output); amd_comgr_release_data(input); if (status != AMD_COMGR_STATUS_SUCCESS) { fprintf(stderr, "hotswap: COMGR rewrite failed for %s -> %s (rc=%d)\n", - source_isa, target_isa, static_cast(status)); + source_isa.c_str(), target_isa, static_cast(status)); return static_cast(status); } diff --git a/projects/hotswap/hotswap.hpp b/projects/hotswap/hotswap.hpp index a730b0ae6708..ce31e0cdf42b 100644 --- a/projects/hotswap/hotswap.hpp +++ b/projects/hotswap/hotswap.hpp @@ -11,11 +11,15 @@ namespace rocr::hotswap { -/// Rewrite a code object from source_isa to target_isa via COMGR. +/// Retarget a code object from its own ISA to the running GPU's ISA via COMGR. /// -/// Called by the hotswap tools lib when the code object's ISA differs from -/// the agent's ISA, or when stepping patches are needed (e.g., B0-to-A0). -/// Delegates to COMGR's amd_comgr_hotswap_rewrite (linked directly). +/// The source ISA is read from the code object via COMGR +/// (amd_comgr_get_data_isa_name); the target ISA is the running GPU's ISA, +/// supplied by the caller (e.g. from the HSA agent). COMGR's +/// amd_comgr_hotswap_rewrite (linked directly) applies whatever transformation +/// the source/target pair calls for -- same-ISA stepping patches (e.g. gfx1250 +/// B0 to A0) or cross-family transpilation -- and returns the rewritten code +/// object. If no transformation is needed, the output is a copy of the input. /// /// On success, *out_data and *out_size describe the rewritten code object. /// If *out_data differs from elf_data, it was allocated by this function @@ -27,8 +31,8 @@ namespace rocr::hotswap { /// /// Returns 0 on success, non-zero on failure. int RetargetCodeObject(const void *elf_data, size_t elf_size, - const char *source_isa, const char *target_isa, - void **out_data, size_t *out_size); + const char *target_isa, void **out_data, + size_t *out_size); } // namespace rocr::hotswap diff --git a/projects/hotswap/hotswap_tool.cpp b/projects/hotswap/hotswap_tool.cpp index 444bd1caaa87..ebc0b9db462f 100644 --- a/projects/hotswap/hotswap_tool.cpp +++ b/projects/hotswap/hotswap_tool.cpp @@ -21,14 +21,11 @@ #include #include #include -#include -#include #include #include #include #include #include -#include #include #include #include @@ -83,22 +80,6 @@ void stash_bytes(uint64_t handle, const uint8_t *data, size_t size) { g_reader_map[handle] = ReaderEntry{std::move(vec), false, false}; } -bool checked_add(size_t lhs, size_t rhs, size_t *out) { - if (lhs > std::numeric_limits::max() - rhs) { - return false; - } - *out = lhs + rhs; - return true; -} - -bool checked_mul(size_t lhs, size_t rhs, size_t *out) { - if (lhs != 0 && rhs > std::numeric_limits::max() / lhs) { - return false; - } - *out = lhs * rhs; - return true; -} - bool try_get_reader_entry(uint64_t handle, ByteVec *bytes, bool *from_file) { std::scoped_lock lock(g_reader_map_mutex); const auto it = g_reader_map.find(handle); @@ -129,145 +110,6 @@ void mark_reader_keepalive(uint64_t handle) { } } -// Validate ELF64 header and return pointer, or nullptr on failure. -const Elf64_Ehdr *validate_elf64(const uint8_t *elf, size_t size) { - if (size < sizeof(Elf64_Ehdr)) { - return nullptr; - } - const auto *ehdr = reinterpret_cast(elf); - if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) { - return nullptr; - } - if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { - return nullptr; - } - return ehdr; -} - -bool validate_program_header_table(const Elf64_Ehdr *ehdr, size_t size) { - return ehdr->e_phoff != 0 && ehdr->e_phoff <= size && ehdr->e_phnum != 0 && - ehdr->e_phentsize >= sizeof(Elf64_Phdr); -} - -bool compute_program_header_offset(const Elf64_Ehdr *ehdr, size_t size, - uint16_t index, size_t *hdr_offset) { - size_t hdr_index_offset = 0; - if (!checked_mul(static_cast(index), - static_cast(ehdr->e_phentsize), - &hdr_index_offset) || - !checked_add(ehdr->e_phoff, hdr_index_offset, hdr_offset) || - *hdr_offset > size || sizeof(Elf64_Phdr) > size - *hdr_offset) { - return false; - } - return true; -} - -bool compute_note_segment_bounds(const Elf64_Phdr *phdr, size_t size, - size_t *note_offset, size_t *note_end) { - *note_offset = phdr->p_offset; - return *note_offset <= size && phdr->p_filesz <= size - *note_offset && - checked_add(*note_offset, phdr->p_filesz, note_end); -} - -bool compute_note_layout(size_t note_offset, size_t note_end, - const Elf64_Nhdr *nhdr, size_t *desc_off, - size_t *next_note) { - size_t raw_name_size = 0; - size_t raw_desc_size = 0; - size_t name_sz_aligned = 0; - size_t desc_sz_aligned = 0; - if (!checked_add(static_cast(nhdr->n_namesz), 3, &raw_name_size) || - !checked_add(static_cast(nhdr->n_descsz), 3, &raw_desc_size)) { - return false; - } - - name_sz_aligned = raw_name_size & ~size_t{3}; - desc_sz_aligned = raw_desc_size & ~size_t{3}; - return checked_add(note_offset, sizeof(Elf64_Nhdr), desc_off) && - checked_add(*desc_off, name_sz_aligned, desc_off) && - checked_add(*desc_off, desc_sz_aligned, next_note) && - *next_note <= note_end; -} - -// Search a single NT_AMDGPU_METADATA note descriptor for the ISA triple. -std::string find_isa_in_metadata(const char *desc, size_t desc_size) { - const char prefix[] = "amdgcn-amd-amdhsa--"; - const size_t prefix_len = sizeof(prefix) - 1; - for (size_t j = 0; j + prefix_len <= desc_size; ++j) { - if (memcmp(desc + j, prefix, prefix_len) == 0) { - size_t len = 0; - while (j + len < desc_size && desc[j + len] != '\0' && - desc[j + len] != '\n' && desc[j + len] != '\'' && - desc[j + len] != '"' && desc[j + len] != ' ') { - ++len; - } - return std::string(desc + j, len); - } - } - return {}; -} - -std::string read_elf_isa_from_note_segment(const uint8_t *elf, size_t note_offset, - size_t note_end) { - while (note_offset <= note_end && - sizeof(Elf64_Nhdr) <= note_end - note_offset) { - const auto *nhdr = reinterpret_cast(elf + note_offset); - size_t desc_off = 0; - size_t next_note = 0; - if (!compute_note_layout(note_offset, note_end, nhdr, &desc_off, - &next_note)) { - break; - } - - constexpr uint32_t NT_AMDGPU_METADATA = 32; - if (nhdr->n_type == NT_AMDGPU_METADATA && nhdr->n_descsz > 0 && - desc_off + nhdr->n_descsz <= note_end) { - const char *desc = reinterpret_cast(elf + desc_off); - std::string result = find_isa_in_metadata(desc, nhdr->n_descsz); - if (!result.empty()) { - return result; - } - } - - note_offset = next_note; - } - return {}; -} - -// Parse ELF PT_NOTE segments to find the AMDGPU ISA name from -// NT_AMDGPU_METADATA (type 32) notes in v3+ code objects. -std::string read_elf_isa_note(const uint8_t *elf, size_t size) { - const Elf64_Ehdr *ehdr = validate_elf64(elf, size); - if (!ehdr || !validate_program_header_table(ehdr, size)) { - return {}; - } - - for (uint16_t i = 0; i < ehdr->e_phnum; ++i) { - size_t hdr_offset = 0; - if (!compute_program_header_offset(ehdr, size, i, &hdr_offset)) { - break; - } - const auto *phdr = - reinterpret_cast(elf + hdr_offset); - if (phdr->p_type != PT_NOTE) { - continue; - } - - size_t note_offset = 0; - size_t note_end = 0; - if (!compute_note_segment_bounds(phdr, size, ¬e_offset, ¬e_end)) { - continue; - } - - std::string result = read_elf_isa_from_note_segment(elf, note_offset, - note_end); - if (!result.empty()) { - return result; - } - } - return {}; -} - hsa_status_t HSA_API hotswap_reader_create_from_memory( const void *code_object, size_t size, hsa_code_object_reader_t *code_object_reader) { @@ -384,22 +226,18 @@ hsa_status_t try_retarget_and_load(hsa_executable_t executable, hsa_agent_t agen const char *options, hsa_loaded_code_object_t *loaded_code_object, const ByteVec &local_bytes) { - const std::string source_isa = - read_elf_isa_note(local_bytes->data(), local_bytes->size()); + // Source ISA is derived from the code object inside RetargetCodeObject; the + // target ISA is the running GPU. const std::string target_isa = get_agent_isa_name(agent); - - if (source_isa.empty() || target_isa.empty()) { + if (target_isa.empty()) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } - // Route through RetargetCodeObject for unified logging, validation, - // and COMGR interaction. Do NOT skip when source == target: B0-to-A0 - // patching uses the same ISA name on both sides. void *out_elf = nullptr; size_t out_elf_size = 0; const int rc = rocr::hotswap::RetargetCodeObject( - local_bytes->data(), local_bytes->size(), source_isa.c_str(), - target_isa.c_str(), &out_elf, &out_elf_size); + local_bytes->data(), local_bytes->size(), target_isa.c_str(), &out_elf, + &out_elf_size); if (rc != 0 || out_elf == local_bytes->data()) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; diff --git a/projects/hotswap/tests/hotswap_test.cpp b/projects/hotswap/tests/hotswap_test.cpp index 32753e2820b4..34bf67b2f564 100644 --- a/projects/hotswap/tests/hotswap_test.cpp +++ b/projects/hotswap/tests/hotswap_test.cpp @@ -7,11 +7,12 @@ #include "hotswap.hpp" #include #include -#include static int tests_passed = 0; static int tests_failed = 0; +static constexpr const char *kTargetIsa = "amdgcn-amd-amdhsa--gfx1250"; + static void check(bool cond, const char *name) { if (cond) { ++tests_passed; @@ -22,35 +23,30 @@ static void check(bool cond, const char *name) { } } -static void test_RetargetPassthrough() { - printf("TEST RetargetPassthrough...\n"); +// A 16-byte fake ELF has no AMDGPU metadata note, so the ISA cannot be derived; +// RetargetCodeObject must fail and leave the original code object untouched. +static void test_RetargetInvalidCodeObjectFallsBack() { + printf("TEST RetargetInvalidCodeObjectFallsBack...\n"); const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F', 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; void *out_data = nullptr; size_t out_size = 0; const int rc = rocr::hotswap::RetargetCodeObject( - fake_elf, sizeof(fake_elf), "amdgcn-amd-amdhsa--gfx1250", - "amdgcn-amd-amdhsa--gfx1250", &out_data, &out_size); + fake_elf, sizeof(fake_elf), kTargetIsa, &out_data, &out_size); - check(rc == 0, "RetargetCodeObject succeeds"); - check(out_size == sizeof(fake_elf), "output size matches input"); - if (out_data && out_data != fake_elf) { - check(memcmp(out_data, fake_elf, sizeof(fake_elf)) == 0, - "output bytes match input (passthrough)"); - std::free(out_data); - } else { - check(out_data != nullptr && out_data != fake_elf, - "out_data is a new allocation"); - } + check(rc != 0, "RetargetCodeObject fails when ISA cannot be derived"); + check(out_data == static_cast(fake_elf), + "out_data still points to original input after failure"); + check(out_size == sizeof(fake_elf), + "out_size still matches original after failure"); } static void test_RetargetNullOutputPointers() { printf("TEST RetargetNullOutputPointers...\n"); const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F'}; - const int rc = rocr::hotswap::RetargetCodeObject( - fake_elf, sizeof(fake_elf), "amdgcn-amd-amdhsa--gfx1250", - "amdgcn-amd-amdhsa--gfx1250", nullptr, nullptr); + const int rc = rocr::hotswap::RetargetCodeObject(fake_elf, sizeof(fake_elf), + kTargetIsa, nullptr, nullptr); check(rc != 0, "RetargetCodeObject rejects null output pointers"); } @@ -58,67 +54,26 @@ static void test_RetargetNullInputs() { printf("TEST RetargetNullInputs...\n"); void *out_data = nullptr; size_t out_size = 0; - const int rc = rocr::hotswap::RetargetCodeObject( - nullptr, 0, "amdgcn-amd-amdhsa--gfx1250", "amdgcn-amd-amdhsa--gfx1250", - &out_data, &out_size); + const int rc = rocr::hotswap::RetargetCodeObject(nullptr, 0, kTargetIsa, + &out_data, &out_size); check(rc != 0, "RetargetCodeObject rejects null elf_data"); } -static void test_RetargetOutputOwnership() { - // Verifies that on success, out_data is a NEW allocation (not the input). - // This catches the bug where out_elf was stashed unconditionally — - // the caller must be able to distinguish "new buffer to free" from - // "original buffer, don't free". - printf("TEST RetargetOutputOwnership...\n"); - const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F', 0x02, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00}; - void *out_data = nullptr; - size_t out_size = 0; - const int rc = rocr::hotswap::RetargetCodeObject( - fake_elf, sizeof(fake_elf), "amdgcn-amd-amdhsa--gfx1250", - "amdgcn-amd-amdhsa--gfx1250", &out_data, &out_size); - - check(rc == 0, "RetargetCodeObject succeeds"); - check(out_data != static_cast(fake_elf), - "out_data is NOT the original input (new allocation)"); - check(out_data != nullptr, "out_data is non-null"); - - if (out_data && out_data != fake_elf) { - // Verify we can write to it (proves it's a heap allocation, not const) - memset(out_data, 0xAB, out_size); - std::free(out_data); - check(true, "out_data is writable and freeable"); - } -} - -static void test_RetargetFailureLeavesOriginal() { - // Verifies that on failure, out_data points to the original input - // (not a dangling pointer or null). This catches use-after-free bugs - // where the fallback buffer was freed prematurely. - printf("TEST RetargetFailureLeavesOriginal...\n"); +static void test_RetargetNullTarget() { + printf("TEST RetargetNullTarget...\n"); const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F'}; void *out_data = nullptr; size_t out_size = 0; - - // Use an unsupported ISA pair to force failure - const int rc = rocr::hotswap::RetargetCodeObject( - fake_elf, sizeof(fake_elf), "amdgcn-amd-amdhsa--gfx_unsupported", - "amdgcn-amd-amdhsa--gfx_unsupported", &out_data, &out_size); - - check(rc != 0, "RetargetCodeObject fails for unsupported ISA"); - check(out_data == static_cast(fake_elf), - "out_data still points to original input after failure"); - check(out_size == sizeof(fake_elf), - "out_size still matches original after failure"); + const int rc = rocr::hotswap::RetargetCodeObject(fake_elf, sizeof(fake_elf), + nullptr, &out_data, &out_size); + check(rc != 0, "RetargetCodeObject rejects null target_isa"); } int main() { - test_RetargetPassthrough(); + test_RetargetInvalidCodeObjectFallsBack(); test_RetargetNullOutputPointers(); test_RetargetNullInputs(); - test_RetargetOutputOwnership(); - test_RetargetFailureLeavesOriginal(); + test_RetargetNullTarget(); printf("\n%d passed, %d failed\n", tests_passed, tests_failed); return tests_failed ? EXIT_FAILURE : EXIT_SUCCESS; From 3727e5e309300b5bc2e4c41fb9683bcbc47132e9 Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Tue, 23 Jun 2026 18:43:06 +0000 Subject: [PATCH 5/9] hotswap: pass explicit source/target ISA into RetargetCodeObject Move code-object ISA discovery into GetCodeObjectIsaName (still via amd_comgr_get_data_isa_name) and have the HSA tool derive source_isa from the code object and target_isa from the agent, passing both into RetargetCodeObject. This keeps RetargetCodeObject a pure rewrite primitive whose source/target can be overridden (e.g. cross-gen). --- projects/hotswap/hotswap.cpp | 70 ++++++++++++++++++------------- projects/hotswap/hotswap.hpp | 19 ++++++--- projects/hotswap/hotswap_tool.cpp | 11 ++--- 3 files changed, 60 insertions(+), 40 deletions(-) diff --git a/projects/hotswap/hotswap.cpp b/projects/hotswap/hotswap.cpp index a9a67b747713..740d4f0f2248 100644 --- a/projects/hotswap/hotswap.cpp +++ b/projects/hotswap/hotswap.cpp @@ -14,9 +14,45 @@ namespace rocr::hotswap { +std::string GetCodeObjectIsaName(const void *elf_data, size_t elf_size) { + if (!elf_data || elf_size == 0) { + return {}; + } + + amd_comgr_data_t data = {0}; + if (amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &data) != + AMD_COMGR_STATUS_SUCCESS) { + return {}; + } + + std::string isa; + if (amd_comgr_set_data(data, elf_size, + static_cast(elf_data)) == + AMD_COMGR_STATUS_SUCCESS) { + size_t isa_len = 0; + if (amd_comgr_get_data_isa_name(data, &isa_len, nullptr) == + AMD_COMGR_STATUS_SUCCESS && + isa_len > 0) { + isa.resize(isa_len); + if (amd_comgr_get_data_isa_name(data, &isa_len, isa.data()) == + AMD_COMGR_STATUS_SUCCESS) { + // Reported size includes the terminating NUL. + if (!isa.empty() && isa.back() == '\0') { + isa.pop_back(); + } + } else { + isa.clear(); + } + } + } + + amd_comgr_release_data(data); + return isa; +} + int RetargetCodeObject(const void *elf_data, size_t elf_size, - const char *target_isa, void **out_data, - size_t *out_size) { + const char *source_isa, const char *target_isa, + void **out_data, size_t *out_size) { using OwnedElf = std::unique_ptr; if (!out_data || !out_size) { @@ -27,7 +63,7 @@ int RetargetCodeObject(const void *elf_data, size_t elf_size, *out_data = const_cast(elf_data); *out_size = elf_size; - if (!elf_data || elf_size == 0 || !target_isa) { + if (!elf_data || elf_size == 0 || !source_isa || !target_isa) { fprintf(stderr, "hotswap: invalid null input argument(s)\n"); return -1; } @@ -48,37 +84,13 @@ int RetargetCodeObject(const void *elf_data, size_t elf_size, return static_cast(status); } - // Source ISA from the code object via COMGR (LLVM-canonical, tracks triple - // normalization); the target ISA is the running GPU, supplied by the caller. - size_t isa_len = 0; - status = amd_comgr_get_data_isa_name(input, &isa_len, nullptr); - if (status != AMD_COMGR_STATUS_SUCCESS || isa_len == 0) { - fprintf(stderr, "hotswap: failed to query COMGR ISA name (rc=%d)\n", - static_cast(status)); - amd_comgr_release_data(input); - return static_cast(status); - } - std::string source_isa(isa_len, '\0'); - status = amd_comgr_get_data_isa_name(input, &isa_len, source_isa.data()); - if (status != AMD_COMGR_STATUS_SUCCESS) { - fprintf(stderr, "hotswap: failed to read COMGR ISA name (rc=%d)\n", - static_cast(status)); - amd_comgr_release_data(input); - return static_cast(status); - } - // Reported size includes the terminating NUL. - if (!source_isa.empty() && source_isa.back() == '\0') { - source_isa.pop_back(); - } - amd_comgr_data_t output = {0}; - status = amd_comgr_hotswap_rewrite(input, source_isa.c_str(), target_isa, - &output); + status = amd_comgr_hotswap_rewrite(input, source_isa, target_isa, &output); amd_comgr_release_data(input); if (status != AMD_COMGR_STATUS_SUCCESS) { fprintf(stderr, "hotswap: COMGR rewrite failed for %s -> %s (rc=%d)\n", - source_isa.c_str(), target_isa, static_cast(status)); + source_isa, target_isa, static_cast(status)); return static_cast(status); } diff --git a/projects/hotswap/hotswap.hpp b/projects/hotswap/hotswap.hpp index ce31e0cdf42b..4034723b25bb 100644 --- a/projects/hotswap/hotswap.hpp +++ b/projects/hotswap/hotswap.hpp @@ -8,14 +8,21 @@ #define ROCR_HOTSWAP_HPP #include +#include namespace rocr::hotswap { -/// Retarget a code object from its own ISA to the running GPU's ISA via COMGR. +/// Read a code object's own ISA name via COMGR (amd_comgr_get_data_isa_name). /// -/// The source ISA is read from the code object via COMGR -/// (amd_comgr_get_data_isa_name); the target ISA is the running GPU's ISA, -/// supplied by the caller (e.g. from the HSA agent). COMGR's +/// Uses COMGR's LLVM-canonical parser, so it tracks triple normalization +/// without hand-rolled metadata parsing. Returns an empty string on failure. +std::string GetCodeObjectIsaName(const void *elf_data, size_t elf_size); + +/// Retarget a code object from source_isa to target_isa via COMGR. +/// +/// Both ISA names are supplied by the caller: source_isa typically comes from +/// the code object (see GetCodeObjectIsaName) and target_isa from the running +/// GPU (e.g. the HSA agent), but either may be overridden. COMGR's /// amd_comgr_hotswap_rewrite (linked directly) applies whatever transformation /// the source/target pair calls for -- same-ISA stepping patches (e.g. gfx1250 /// B0 to A0) or cross-family transpilation -- and returns the rewritten code @@ -31,8 +38,8 @@ namespace rocr::hotswap { /// /// Returns 0 on success, non-zero on failure. int RetargetCodeObject(const void *elf_data, size_t elf_size, - const char *target_isa, void **out_data, - size_t *out_size); + const char *source_isa, const char *target_isa, + void **out_data, size_t *out_size); } // namespace rocr::hotswap diff --git a/projects/hotswap/hotswap_tool.cpp b/projects/hotswap/hotswap_tool.cpp index ebc0b9db462f..1f43abdfcda8 100644 --- a/projects/hotswap/hotswap_tool.cpp +++ b/projects/hotswap/hotswap_tool.cpp @@ -226,18 +226,19 @@ hsa_status_t try_retarget_and_load(hsa_executable_t executable, hsa_agent_t agen const char *options, hsa_loaded_code_object_t *loaded_code_object, const ByteVec &local_bytes) { - // Source ISA is derived from the code object inside RetargetCodeObject; the - // target ISA is the running GPU. + // Source ISA from the code object, target ISA from the running GPU. + const std::string source_isa = rocr::hotswap::GetCodeObjectIsaName( + local_bytes->data(), local_bytes->size()); const std::string target_isa = get_agent_isa_name(agent); - if (target_isa.empty()) { + if (source_isa.empty() || target_isa.empty()) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } void *out_elf = nullptr; size_t out_elf_size = 0; const int rc = rocr::hotswap::RetargetCodeObject( - local_bytes->data(), local_bytes->size(), target_isa.c_str(), &out_elf, - &out_elf_size); + local_bytes->data(), local_bytes->size(), source_isa.c_str(), + target_isa.c_str(), &out_elf, &out_elf_size); if (rc != 0 || out_elf == local_bytes->data()) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; From a13f50e1aab56d904d6d76e384248539ad555e77 Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Tue, 23 Jun 2026 18:43:06 +0000 Subject: [PATCH 6/9] hotswap: add unit test for code-object ISA parse + rewrite Add a minimal gfx1250 fixture (tests/fixtures/gfx1250_min.hsaco), embedded into the test binary at build time, plus tests exercising GetCodeObjectIsaName and RetargetCodeObject end-to-end. GPU-free and file-free, so it runs on any CI runner. --- projects/hotswap/CMakeLists.txt | 12 +++ projects/hotswap/tests/fixtures/README.md | 16 ++++ .../hotswap/tests/fixtures/gfx1250_min.hsaco | Bin 0 -> 3936 bytes projects/hotswap/tests/hotswap_test.cpp | 84 ++++++++++++++---- 4 files changed, 97 insertions(+), 15 deletions(-) create mode 100644 projects/hotswap/tests/fixtures/README.md create mode 100644 projects/hotswap/tests/fixtures/gfx1250_min.hsaco diff --git a/projects/hotswap/CMakeLists.txt b/projects/hotswap/CMakeLists.txt index b9e0f3734b65..d71ae9143bee 100644 --- a/projects/hotswap/CMakeLists.txt +++ b/projects/hotswap/CMakeLists.txt @@ -56,6 +56,18 @@ add_executable(hotswap_test tests/hotswap_test.cpp hotswap.cpp) target_include_directories(hotswap_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(hotswap_test PRIVATE amd_comgr) +# Embed the gfx1250 fixture code object as a byte array so the test exercises the +# real parse + rewrite path with no GPU and no runtime file dependency (works in +# any CI). Regenerate the .hsaco per tests/fixtures/README.md. +set(_hotswap_co "${CMAKE_CURRENT_SOURCE_DIR}/tests/fixtures/gfx1250_min.hsaco") +set(_hotswap_co_hdr "${CMAKE_CURRENT_BINARY_DIR}/gfx1250_min_hsaco.h") +file(READ "${_hotswap_co}" _hotswap_co_hex HEX) +string(REGEX REPLACE "(..)" "0x\\1," _hotswap_co_arr "${_hotswap_co_hex}") +file(WRITE "${_hotswap_co_hdr}" + "// Generated from tests/fixtures/gfx1250_min.hsaco. Do not edit.\n" + "static const unsigned char kGfx1250MinCo[] = {${_hotswap_co_arr}};\n") +target_include_directories(hotswap_test PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + # Unit tests for the gfx-target / ASIC-revision query logic. The test compiles # the portable hotswap_gfx_query.cpp unit alongside the test translation unit # and supplies its own stubs for the HSA entry points, so this target needs diff --git a/projects/hotswap/tests/fixtures/README.md b/projects/hotswap/tests/fixtures/README.md new file mode 100644 index 000000000000..0756c21e4c85 --- /dev/null +++ b/projects/hotswap/tests/fixtures/README.md @@ -0,0 +1,16 @@ +# HotSwap test fixtures + +## gfx1250_min.hsaco + +A minimal gfx1250 code object (empty kernel) used by `hotswap_test` to exercise +the real parse -> ISA-derivation -> rewrite path. It carries a valid +`NT_AMDGPU_METADATA` note (`amdhsa.target: amdgcn-amd-amdhsa--gfx1250`, +`.gfx1250_revision: B0`), which is what `amd_comgr_get_data_isa_name` reads. + +Regenerate with the ROCm clang: + +```bash +echo 'kernel void k(){}' > k.cl +clang -target amdgcn-amd-amdhsa -mcpu=gfx1250 -mcode-object-version=5 \ + -nogpulib -x cl -cl-std=CL2.0 -O2 k.cl -o gfx1250_min.hsaco +``` diff --git a/projects/hotswap/tests/fixtures/gfx1250_min.hsaco b/projects/hotswap/tests/fixtures/gfx1250_min.hsaco new file mode 100644 index 0000000000000000000000000000000000000000..edb84446d9ab88051b9505cf34196c53e57361c9 GIT binary patch literal 3936 zcmeHK&2Jl35Px<|ngSvYLWnPc50wZ)d0FSfj&o=ol0quEv_wU~X}jKiUN5$Ht=+Yo zRv=dD0WOG?UbrC;6+san5u&0X(TEFN_!kft4oDm-Bsdagc4u5`6GViNKq^mpo_X`; zH}l@SeQzeO9Iri@&19-0B55Py2azEjNtO7scO=Y-+fcS3#^5(E?h>OQ&Oxsx(|hO+ z8Dly|dMHNRpS63y4>qRYEN+O4r_eFBdlSMm$|hr`!#IRQw18-m^oVxJOmv7C@`vUQ zf@3`IZ|%p>!u=i8AIJO|!12~-B?lttD~>1aU!>?JIbh1M$L(H%xdPColPgc1epZN! z9~m9185pvqd{?!DOW%u$E7I`oU}Ia_jrHQpyr%oA?*xwLzWaoBP1?TK>*|5BJIW39 zhO@3LJ#a24aW%F9(7~k9c5FBI*&cbU{()>8uH7?iwRx(m+~rzfd3zwz`^ra;Z)U~k zvSY04jkXc$=RChf?!n=A8F59rMn}EV`cig%r*DL6;Cg6XTQab_zHWLwHx%2+Am}>n zHi_Ps!TCOp9jt*^Iaf7n7i7`E`SJZJP(!}lN{xVOTG z6x0x+p~5d!!y*%jBZ(}3H7kC8{jHZS{!-gQbJ&O1#BT?PB)74vv-hRK)#D3C5mR!m4od}1x2xxRoMdreq$mQ~x2SIip^&?;_j>mXT zEB%x`gt`wwpM(Ay&uLXLn$M4I9KD>|IChz=g2FA*mPVs%@p*|)t_uhlM&UKsgh)|gk-(kFhR0fFW>1N(e_wo^1o>uo4vX;z0UO7spZaeyWQ_hb$#zeWrh+a z3v;E0S)ZM)H0q^du~-4N+^Cdi&9Z4!RJo`bv&Blovb4ufFP&LlJ-)IyH>Z`0rN%;i zpst}SFz^GHnPP*OOZUMCQE_*lpYECNEK4zvlz6E2~Dk>2!`7oJ`^d19HP z@b@Xk4{NkRzGa3e*K_P4{n;ZH?-axMo-sX3K)h$@i|bd+q1eWu|F*m9F`n-k)5%!t zrhUefSlkZ`+Gaf8OQvZ&ub(M?SNJqAp5Fnc{0@Lh+>%_zBOx9I7G;d*dWdQDrfSTk z$I~Dk#_!O=n4U@D)BCRwp5HmH)8cw%;N_#)Kc!nb1;{dp*rJ-pfQ`)ZG#kLT1t`eM^gYEGVK8_9U?TP wQ6g9;yZD`=9&KxoKVAkI@10a*vPt+ihK0bhUlzX#%zpSXB`OLhtM9@80mAWT)&Kwi literal 0 HcmV?d00001 diff --git a/projects/hotswap/tests/hotswap_test.cpp b/projects/hotswap/tests/hotswap_test.cpp index 34bf67b2f564..710e81885252 100644 --- a/projects/hotswap/tests/hotswap_test.cpp +++ b/projects/hotswap/tests/hotswap_test.cpp @@ -5,13 +5,16 @@ //===----------------------------------------------------------------------===// #include "hotswap.hpp" +// Generated from tests/fixtures/gfx1250_min.hsaco: kGfx1250MinCo[]. Embedding it +// keeps the test GPU-free and free of any runtime file dependency (CI-portable). +#include "gfx1250_min_hsaco.h" #include #include static int tests_passed = 0; static int tests_failed = 0; -static constexpr const char *kTargetIsa = "amdgcn-amd-amdhsa--gfx1250"; +static constexpr const char *kGfx1250Isa = "amdgcn-amd-amdhsa--gfx1250"; static void check(bool cond, const char *name) { if (cond) { @@ -23,8 +26,45 @@ static void check(bool cond, const char *name) { } } -// A 16-byte fake ELF has no AMDGPU metadata note, so the ISA cannot be derived; -// RetargetCodeObject must fail and leave the original code object untouched. +// A real gfx1250 code object: COMGR parses its ISA name from the metadata note. +static void test_GetIsaNameRealCodeObject() { + printf("TEST GetIsaNameRealCodeObject...\n"); + const std::string isa = + rocr::hotswap::GetCodeObjectIsaName(kGfx1250MinCo, sizeof(kGfx1250MinCo)); + check(isa == kGfx1250Isa, "GetCodeObjectIsaName reads gfx1250 from a real CO"); +} + +// A 16-byte fake ELF has no metadata note: the ISA name cannot be parsed. +static void test_GetIsaNameInvalidCodeObject() { + printf("TEST GetIsaNameInvalidCodeObject...\n"); + const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F', 0x02, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}; + const std::string isa = + rocr::hotswap::GetCodeObjectIsaName(fake_elf, sizeof(fake_elf)); + check(isa.empty(), "GetCodeObjectIsaName returns empty for an invalid CO"); +} + +// A real gfx1250 CO rewrites successfully and yields a fresh output buffer. +static void test_RetargetRealCodeObject() { + printf("TEST RetargetRealCodeObject...\n"); + void *out_data = nullptr; + size_t out_size = 0; + const int rc = rocr::hotswap::RetargetCodeObject( + kGfx1250MinCo, sizeof(kGfx1250MinCo), kGfx1250Isa, kGfx1250Isa, &out_data, + &out_size); + + check(rc == 0, "RetargetCodeObject rewrites a real CO"); + check(out_data != nullptr && + out_data != static_cast(kGfx1250MinCo), + "out_data is a fresh allocation"); + check(out_size > 0, "out_size is non-zero"); + if (out_data && out_data != static_cast(kGfx1250MinCo)) { + std::free(out_data); + } +} + +// An un-rewritable (fake) CO fails and leaves the original code object intact. static void test_RetargetInvalidCodeObjectFallsBack() { printf("TEST RetargetInvalidCodeObjectFallsBack...\n"); const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F', 0x02, 0x01, 0x01, @@ -33,9 +73,10 @@ static void test_RetargetInvalidCodeObjectFallsBack() { void *out_data = nullptr; size_t out_size = 0; const int rc = rocr::hotswap::RetargetCodeObject( - fake_elf, sizeof(fake_elf), kTargetIsa, &out_data, &out_size); + fake_elf, sizeof(fake_elf), kGfx1250Isa, kGfx1250Isa, &out_data, + &out_size); - check(rc != 0, "RetargetCodeObject fails when ISA cannot be derived"); + check(rc != 0, "RetargetCodeObject fails for an un-rewritable CO"); check(out_data == static_cast(fake_elf), "out_data still points to original input after failure"); check(out_size == sizeof(fake_elf), @@ -45,8 +86,8 @@ static void test_RetargetInvalidCodeObjectFallsBack() { static void test_RetargetNullOutputPointers() { printf("TEST RetargetNullOutputPointers...\n"); const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F'}; - const int rc = rocr::hotswap::RetargetCodeObject(fake_elf, sizeof(fake_elf), - kTargetIsa, nullptr, nullptr); + const int rc = rocr::hotswap::RetargetCodeObject( + fake_elf, sizeof(fake_elf), kGfx1250Isa, kGfx1250Isa, nullptr, nullptr); check(rc != 0, "RetargetCodeObject rejects null output pointers"); } @@ -54,26 +95,39 @@ static void test_RetargetNullInputs() { printf("TEST RetargetNullInputs...\n"); void *out_data = nullptr; size_t out_size = 0; - const int rc = rocr::hotswap::RetargetCodeObject(nullptr, 0, kTargetIsa, - &out_data, &out_size); + const int rc = rocr::hotswap::RetargetCodeObject( + nullptr, 0, kGfx1250Isa, kGfx1250Isa, &out_data, &out_size); check(rc != 0, "RetargetCodeObject rejects null elf_data"); } -static void test_RetargetNullTarget() { - printf("TEST RetargetNullTarget...\n"); +static void test_RetargetNullSourceOrTarget() { + printf("TEST RetargetNullSourceOrTarget...\n"); const unsigned char fake_elf[] = {0x7f, 'E', 'L', 'F'}; void *out_data = nullptr; size_t out_size = 0; - const int rc = rocr::hotswap::RetargetCodeObject(fake_elf, sizeof(fake_elf), - nullptr, &out_data, &out_size); - check(rc != 0, "RetargetCodeObject rejects null target_isa"); + const int rc_src = rocr::hotswap::RetargetCodeObject( + fake_elf, sizeof(fake_elf), nullptr, kGfx1250Isa, &out_data, &out_size); + check(rc_src != 0, "RetargetCodeObject rejects null source_isa"); + const int rc_tgt = rocr::hotswap::RetargetCodeObject( + fake_elf, sizeof(fake_elf), kGfx1250Isa, nullptr, &out_data, &out_size); + check(rc_tgt != 0, "RetargetCodeObject rejects null target_isa"); } int main() { + // Line-buffer stdout so the tool's stderr diagnostics interleave next to the + // test that triggered them (e.g. under ctest -V) instead of clumping. + setvbuf(stdout, nullptr, _IOLBF, 0); + printf("Note: the negative tests below intentionally trigger tool diagnostics\n" + "on stderr (\"COMGR rewrite failed ... rc=2\", \"invalid null ...\").\n" + "These are EXPECTED; a passing run is the PASS lines + the final count.\n\n"); + + test_GetIsaNameRealCodeObject(); + test_GetIsaNameInvalidCodeObject(); + test_RetargetRealCodeObject(); test_RetargetInvalidCodeObjectFallsBack(); test_RetargetNullOutputPointers(); test_RetargetNullInputs(); - test_RetargetNullTarget(); + test_RetargetNullSourceOrTarget(); printf("\n%d passed, %d failed\n", tests_passed, tests_failed); return tests_failed ? EXIT_FAILURE : EXIT_SUCCESS; From 56ed59c43e9fdac9f67ec9752dea0c76a9aa26d4 Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Thu, 25 Jun 2026 01:06:13 +0000 Subject: [PATCH 7/9] hotswap: fix OnUnload use-after-free via immortal singletons OnUnload is driven by libamdhip64's atexit handler (hsa_shut_down -> UnloadTools), which runs AFTER this library's C++ static destructors because the tool is dlopen'd by hsa_init (its __cxa_atexit dtors are LIFO-earlier). With ordinary file-scope statics, the reader-map / rewritten-ELF containers and their mutexes were destroyed first, then OnUnload's clear()/lock touched freed memory -> SIGSEGV during process exit (confirmed on gfx1250 A0: ~unique_ptr calling a tcache-corrupted deleter). Make these four objects never-destroyed heap singletons so their lifetime spans past static destruction. OnUnload still clear()s the containers and frees every malloc'd ELF buffer; only the small container shells persist to exit. Not a payload leak. --- projects/hotswap/hotswap_tool.cpp | 70 ++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/projects/hotswap/hotswap_tool.cpp b/projects/hotswap/hotswap_tool.cpp index 1f43abdfcda8..386429390f94 100644 --- a/projects/hotswap/hotswap_tool.cpp +++ b/projects/hotswap/hotswap_tool.cpp @@ -54,15 +54,37 @@ struct ReaderEntry { bool keepalive_after_load = false; }; -std::mutex g_reader_map_mutex; -std::unordered_map g_reader_map; +// These four are deliberately immortal (heap-allocated, never destroyed). HSA +// calls OnUnload from hsa_shut_down, which is driven by libamdhip64's atexit +// handler -- this runs AFTER this library's C++ static destructors (the tool is +// dlopen'd by hsa_init, so its __cxa_atexit dtors are LIFO-earlier-to-run than +// HIP's shutdown handler). If these were ordinary statics, their destructors +// would free the backing storage first, and OnUnload's clear()/lock would then +// touch freed memory -> use-after-free SIGSEGV. Making them never-destroyed +// singletons removes that hazard. This is NOT a buffer leak: OnUnload still +// clear()s the containers and frees every malloc'd ELF buffer; only the small +// container shells persist to process exit (reclaimed by the OS). +std::mutex &reader_map_mutex() { + static std::mutex *m = new std::mutex(); + return *m; +} +std::unordered_map &reader_map() { + static auto *m = new std::unordered_map(); + return *m; +} // Rewritten ELF buffers must outlive the executable because ROCR's // LoadedCodeObjectImpl stores a raw pointer to the ELF data (used by // debuggers, profilers, and hsa_ven_amd_loader queries). We keep them // alive until OnUnload. -std::mutex g_rewritten_elfs_mutex; -std::vector g_rewritten_elfs; +std::mutex &rewritten_elfs_mutex() { + static std::mutex *m = new std::mutex(); + return *m; +} +std::vector &rewritten_elfs() { + static auto *v = new std::vector(); + return *v; +} CoreApiTable *g_core_table = nullptr; @@ -76,14 +98,14 @@ decltype(hsa_executable_load_agent_code_object) *g_orig_load_agent_code_object = void stash_bytes(uint64_t handle, const uint8_t *data, size_t size) { auto vec = std::make_shared>(data, data + size); - std::scoped_lock lock(g_reader_map_mutex); - g_reader_map[handle] = ReaderEntry{std::move(vec), false, false}; + std::scoped_lock lock(reader_map_mutex()); + reader_map()[handle] = ReaderEntry{std::move(vec), false, false}; } bool try_get_reader_entry(uint64_t handle, ByteVec *bytes, bool *from_file) { - std::scoped_lock lock(g_reader_map_mutex); - const auto it = g_reader_map.find(handle); - if (it == g_reader_map.end()) { + std::scoped_lock lock(reader_map_mutex()); + const auto it = reader_map().find(handle); + if (it == reader_map().end()) { return false; } *bytes = it->second.bytes; @@ -93,8 +115,8 @@ bool try_get_reader_entry(uint64_t handle, ByteVec *bytes, bool *from_file) { void retain_rewritten_elf(OwnedElf elf) { try { - std::scoped_lock lock(g_rewritten_elfs_mutex); - g_rewritten_elfs.push_back(std::move(elf)); + std::scoped_lock lock(rewritten_elfs_mutex()); + rewritten_elfs().push_back(std::move(elf)); } catch (const std::bad_alloc &) { // Intentionally leak to preserve debugger/profiler correctness if the // keepalive vector itself cannot grow. @@ -103,9 +125,9 @@ void retain_rewritten_elf(OwnedElf elf) { } void mark_reader_keepalive(uint64_t handle) { - std::scoped_lock lock(g_reader_map_mutex); - auto it = g_reader_map.find(handle); - if (it != g_reader_map.end()) { + std::scoped_lock lock(reader_map_mutex()); + auto it = reader_map().find(handle); + if (it != reader_map().end()) { it->second.keepalive_after_load = true; } } @@ -157,8 +179,8 @@ hsa_status_t HSA_API hotswap_reader_create_from_file( } { - std::scoped_lock lock(g_reader_map_mutex); - g_reader_map[reader.handle] = ReaderEntry{std::move(vec), true, false}; + std::scoped_lock lock(reader_map_mutex()); + reader_map()[reader.handle] = ReaderEntry{std::move(vec), true, false}; } *code_object_reader = reader; return HSA_STATUS_SUCCESS; @@ -174,10 +196,10 @@ hsa_status_t HSA_API hotswap_reader_create_from_file( hsa_status_t HSA_API hotswap_reader_destroy(hsa_code_object_reader_t code_object_reader) { { - std::scoped_lock lock(g_reader_map_mutex); - auto it = g_reader_map.find(code_object_reader.handle); - if (it != g_reader_map.end() && !it->second.keepalive_after_load) { - g_reader_map.erase(it); + std::scoped_lock lock(reader_map_mutex()); + auto it = reader_map().find(code_object_reader.handle); + if (it != reader_map().end() && !it->second.keepalive_after_load) { + reader_map().erase(it); } } return g_orig_reader_destroy(code_object_reader); @@ -363,13 +385,13 @@ void OnUnload() { g_orig_load_agent_code_object = nullptr; { - std::scoped_lock lock(g_reader_map_mutex); - g_reader_map.clear(); + std::scoped_lock lock(reader_map_mutex()); + reader_map().clear(); } { - std::scoped_lock lock(g_rewritten_elfs_mutex); - g_rewritten_elfs.clear(); + std::scoped_lock lock(rewritten_elfs_mutex()); + rewritten_elfs().clear(); } fprintf(stderr, "hotswap: tool unloaded\n"); From 940c9aafa8be219f8e57d8b3b9e2dd20691e1c1f Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Thu, 25 Jun 2026 01:06:13 +0000 Subject: [PATCH 8/9] hotswap: add opt-in HSA_HOTSWAP_VERBOSE diagnostic logging Add a runtime-gated HOTSWAP_LOG(...) macro (driven by HSA_HOTSWAP_VERBOSE, off by default) that traces each intercepted reader-create, load_agent_code_object (with has_bytes), the gate decision, and each rewrite (src/tgt ISA, rc, in/out size, changed). No rebuild needed to toggle; when off the cost is one cached bool load per site. Used to confirm on real gfx1250 A0 that torch/hipBLASLt kernels are intercepted and rewritten. --- projects/hotswap/hotswap_tool.cpp | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/projects/hotswap/hotswap_tool.cpp b/projects/hotswap/hotswap_tool.cpp index 386429390f94..467b5bbb57b4 100644 --- a/projects/hotswap/hotswap_tool.cpp +++ b/projects/hotswap/hotswap_tool.cpp @@ -86,6 +86,24 @@ std::vector &rewritten_elfs() { return *v; } +// Opt-in diagnostic logging (HSA_HOTSWAP_VERBOSE=1). Off by default so +// production stays quiet; lets us confirm which code objects are intercepted, +// gated, and rewritten. Runtime-gated (no rebuild to toggle); when off the cost +// is a single cached bool load per call site. +bool verbose() { + static const bool v = [] { + const char *e = std::getenv("HSA_HOTSWAP_VERBOSE"); + return e && e[0] && e[0] != '0'; + }(); + return v; +} + +#define HOTSWAP_LOG(...) \ + do { \ + if (verbose()) \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) + CoreApiTable *g_core_table = nullptr; decltype(hsa_code_object_reader_create_from_memory) @@ -144,6 +162,8 @@ hsa_status_t HSA_API hotswap_reader_create_from_memory( try { stash_bytes(reader.handle, static_cast(code_object), size); + HOTSWAP_LOG("hotswap: reader_create_from_memory handle=%lu size=%zu\n", + static_cast(reader.handle), size); } catch (const std::bad_alloc &) { // Fall back to the original load path without rewrite support. } @@ -182,6 +202,9 @@ hsa_status_t HSA_API hotswap_reader_create_from_file( std::scoped_lock lock(reader_map_mutex()); reader_map()[reader.handle] = ReaderEntry{std::move(vec), true, false}; } + HOTSWAP_LOG("hotswap: reader_create_from_file handle=%lu size=%lld\n", + static_cast(reader.handle), + static_cast(file_size)); *code_object_reader = reader; return HSA_STATUS_SUCCESS; } catch (const std::bad_alloc &) { @@ -253,6 +276,8 @@ hsa_status_t try_retarget_and_load(hsa_executable_t executable, hsa_agent_t agen local_bytes->data(), local_bytes->size()); const std::string target_isa = get_agent_isa_name(agent); if (source_isa.empty() || target_isa.empty()) { + HOTSWAP_LOG("hotswap: rewrite SKIP empty isa (src='%s' tgt='%s' size=%zu)\n", + source_isa.c_str(), target_isa.c_str(), local_bytes->size()); return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -262,6 +287,10 @@ hsa_status_t try_retarget_and_load(hsa_executable_t executable, hsa_agent_t agen local_bytes->data(), local_bytes->size(), source_isa.c_str(), target_isa.c_str(), &out_elf, &out_elf_size); + HOTSWAP_LOG("hotswap: rewrite src=%s tgt=%s in=%zu rc=%d out=%zu changed=%d\n", + source_isa.c_str(), target_isa.c_str(), local_bytes->size(), rc, + out_elf_size, out_elf != local_bytes->data()); + if (rc != 0 || out_elf == local_bytes->data()) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } @@ -285,6 +314,10 @@ hsa_status_t HSA_API hotswap_load_agent_code_object( try_get_reader_entry(code_object_reader.handle, &local_bytes, &reader_from_file); + HOTSWAP_LOG("hotswap: load_agent_code_object handle=%lu has_bytes=%d\n", + static_cast(code_object_reader.handle), + local_bytes ? 1 : 0); + if (!local_bytes) { return load_original_reader(executable, agent, code_object_reader, options, loaded_code_object, @@ -295,6 +328,8 @@ hsa_status_t HSA_API hotswap_load_agent_code_object( // the original code object unchanged instead of routing through COMGR. const AgentGfxRevision gfx = query_agent_gfx_revision(agent); if (!gate_allows_hotswap(gfx)) { + HOTSWAP_LOG("hotswap: gate BLOCKED (gfx=%s rev=%u valid=%d)\n", + gfx.gfx_target.c_str(), gfx.asic_revision, gfx.revision_valid); return load_original_reader(executable, agent, code_object_reader, options, loaded_code_object, reader_from_file); From c9d7edeb6dca08dc8942c6763a79e0d79ab8fe09 Mon Sep 17 00:00:00 2001 From: Nirmal Senthilkumar Date: Thu, 25 Jun 2026 15:50:55 +0000 Subject: [PATCH 9/9] hotswap: simplify OnUnload to table-restore only (drop immortal singletons) The OnUnload use-after-free is better avoided by NOT touching heap-owning state at teardown than by making that state immortal. OnUnload now only restores the HSA API-table function pointers (POD writes); it no longer locks the mutexes or clear()s g_reader_map / g_rewritten_elfs. Those revert to ordinary file-scope statics, freed once by their normal static destructors at process exit. Mirrors the original comgr hotswap tool (llvm-project #2936), whose OnUnload likewise only restored the table and never crashed at teardown. Safe because ROCr dereferences the retained ELF bytes during the executable's life (debugger/profiler queries), not during teardown. Supersedes the immortal-singleton approach from 56ed59c43e. --- projects/hotswap/hotswap_tool.cpp | 91 ++++++++++++------------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/projects/hotswap/hotswap_tool.cpp b/projects/hotswap/hotswap_tool.cpp index 467b5bbb57b4..ab709d5b57b9 100644 --- a/projects/hotswap/hotswap_tool.cpp +++ b/projects/hotswap/hotswap_tool.cpp @@ -54,37 +54,24 @@ struct ReaderEntry { bool keepalive_after_load = false; }; -// These four are deliberately immortal (heap-allocated, never destroyed). HSA -// calls OnUnload from hsa_shut_down, which is driven by libamdhip64's atexit -// handler -- this runs AFTER this library's C++ static destructors (the tool is -// dlopen'd by hsa_init, so its __cxa_atexit dtors are LIFO-earlier-to-run than -// HIP's shutdown handler). If these were ordinary statics, their destructors -// would free the backing storage first, and OnUnload's clear()/lock would then -// touch freed memory -> use-after-free SIGSEGV. Making them never-destroyed -// singletons removes that hazard. This is NOT a buffer leak: OnUnload still -// clear()s the containers and frees every malloc'd ELF buffer; only the small -// container shells persist to process exit (reclaimed by the OS). -std::mutex &reader_map_mutex() { - static std::mutex *m = new std::mutex(); - return *m; -} -std::unordered_map &reader_map() { - static auto *m = new std::unordered_map(); - return *m; -} +// State for the HSA_TOOLS_LIB callbacks. Ordinary file-scope statics: OnUnload +// is driven late (from HIP's atexit -> hsa_shut_down -> UnloadTools), and it +// deliberately does NOT touch any of this heap-owning state -- it only restores +// the API-table function pointers (POD). That keeps teardown safe regardless of +// C++ static-destruction order; the maps/buffers are freed by their normal +// static destructors at exit. (An earlier version clear()ed these in OnUnload, +// which crashed: by the time OnUnload runs, the static dtors have already freed +// them -> use-after-free SIGSEGV.) +std::mutex g_reader_map_mutex; +std::unordered_map g_reader_map; // Rewritten ELF buffers must outlive the executable because ROCR's -// LoadedCodeObjectImpl stores a raw pointer to the ELF data (used by -// debuggers, profilers, and hsa_ven_amd_loader queries). We keep them -// alive until OnUnload. -std::mutex &rewritten_elfs_mutex() { - static std::mutex *m = new std::mutex(); - return *m; -} -std::vector &rewritten_elfs() { - static auto *v = new std::vector(); - return *v; -} +// LoadedCodeObjectImpl stores a raw pointer to the ELF data (used by debuggers, +// profilers, and hsa_ven_amd_loader queries). Those queries happen during the +// executable's life, not during teardown, so freeing them at process exit (via +// the normal static destructor) is safe. +std::mutex g_rewritten_elfs_mutex; +std::vector g_rewritten_elfs; // Opt-in diagnostic logging (HSA_HOTSWAP_VERBOSE=1). Off by default so // production stays quiet; lets us confirm which code objects are intercepted, @@ -116,14 +103,14 @@ decltype(hsa_executable_load_agent_code_object) *g_orig_load_agent_code_object = void stash_bytes(uint64_t handle, const uint8_t *data, size_t size) { auto vec = std::make_shared>(data, data + size); - std::scoped_lock lock(reader_map_mutex()); - reader_map()[handle] = ReaderEntry{std::move(vec), false, false}; + std::scoped_lock lock(g_reader_map_mutex); + g_reader_map[handle] = ReaderEntry{std::move(vec), false, false}; } bool try_get_reader_entry(uint64_t handle, ByteVec *bytes, bool *from_file) { - std::scoped_lock lock(reader_map_mutex()); - const auto it = reader_map().find(handle); - if (it == reader_map().end()) { + std::scoped_lock lock(g_reader_map_mutex); + const auto it = g_reader_map.find(handle); + if (it == g_reader_map.end()) { return false; } *bytes = it->second.bytes; @@ -133,8 +120,8 @@ bool try_get_reader_entry(uint64_t handle, ByteVec *bytes, bool *from_file) { void retain_rewritten_elf(OwnedElf elf) { try { - std::scoped_lock lock(rewritten_elfs_mutex()); - rewritten_elfs().push_back(std::move(elf)); + std::scoped_lock lock(g_rewritten_elfs_mutex); + g_rewritten_elfs.push_back(std::move(elf)); } catch (const std::bad_alloc &) { // Intentionally leak to preserve debugger/profiler correctness if the // keepalive vector itself cannot grow. @@ -143,9 +130,9 @@ void retain_rewritten_elf(OwnedElf elf) { } void mark_reader_keepalive(uint64_t handle) { - std::scoped_lock lock(reader_map_mutex()); - auto it = reader_map().find(handle); - if (it != reader_map().end()) { + std::scoped_lock lock(g_reader_map_mutex); + auto it = g_reader_map.find(handle); + if (it != g_reader_map.end()) { it->second.keepalive_after_load = true; } } @@ -199,8 +186,8 @@ hsa_status_t HSA_API hotswap_reader_create_from_file( } { - std::scoped_lock lock(reader_map_mutex()); - reader_map()[reader.handle] = ReaderEntry{std::move(vec), true, false}; + std::scoped_lock lock(g_reader_map_mutex); + g_reader_map[reader.handle] = ReaderEntry{std::move(vec), true, false}; } HOTSWAP_LOG("hotswap: reader_create_from_file handle=%lu size=%lld\n", static_cast(reader.handle), @@ -219,10 +206,10 @@ hsa_status_t HSA_API hotswap_reader_create_from_file( hsa_status_t HSA_API hotswap_reader_destroy(hsa_code_object_reader_t code_object_reader) { { - std::scoped_lock lock(reader_map_mutex()); - auto it = reader_map().find(code_object_reader.handle); - if (it != reader_map().end() && !it->second.keepalive_after_load) { - reader_map().erase(it); + std::scoped_lock lock(g_reader_map_mutex); + auto it = g_reader_map.find(code_object_reader.handle); + if (it != g_reader_map.end() && !it->second.keepalive_after_load) { + g_reader_map.erase(it); } } return g_orig_reader_destroy(code_object_reader); @@ -419,16 +406,10 @@ void OnUnload() { g_orig_reader_destroy = nullptr; g_orig_load_agent_code_object = nullptr; - { - std::scoped_lock lock(reader_map_mutex()); - reader_map().clear(); - } - - { - std::scoped_lock lock(rewritten_elfs_mutex()); - rewritten_elfs().clear(); - } - + // Intentionally do NOT clear g_reader_map / g_rewritten_elfs here. OnUnload is + // driven from HIP's atexit hsa_shut_down, which runs AFTER this library's + // static destructors have already freed those containers; touching them here + // is a use-after-free. They are freed exactly once, by their static dtors. fprintf(stderr, "hotswap: tool unloaded\n"); }