diff --git a/Cargo.toml b/Cargo.toml
index fb638f2fb..41350c6cb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,8 @@
 [workspace]
 resolver = "2"
 members = [
+    "builtins-shim",
     "builtins-test",
-    "compiler-builtins",
     "crates/josh-sync",
     "crates/libm-macros",
     "crates/musl-math-sys",
@@ -14,8 +14,8 @@ members = [
 ]
 
 default-members = [
+    "builtins-shim",
     "builtins-test",
-    "compiler-builtins",
     "crates/libm-macros",
     "libm",
     "libm-test",
@@ -26,6 +26,10 @@ exclude = [
     # and `mangled-names` disabled, which is the opposite of what is needed for
     # other tests, so it makes sense to keep it out of the workspace.
     "builtins-test-intrinsics",
+    # We test via the `builtins-shim` crate, so exclude the `compiler-builtins`
+    # that has a dependency on `core`. See `builtins-shim/Cargo.toml` for more
+    # details.
+    "compiler-builtins",
 ]
 
 [profile.release]
diff --git a/builtins-shim/Cargo.toml b/builtins-shim/Cargo.toml
new file mode 100644
index 000000000..8eb880c6f
--- /dev/null
+++ b/builtins-shim/Cargo.toml
@@ -0,0 +1,63 @@
+# NOTE: Must be kept in sync with `../compiler-builtins/Cargo.toml`.
+#
+# The manifest at `../compiler-builtins` is what actually gets used in the
+# rust-lang/rust tree; however, we can't build it out of tree because it
+# depends on `core` by path, and even optional Cargo dependencies need to be
+# available at build time. So, we work around this by having this "shim"
+# manifest that is identical except for the `core` dependency and forwards
+# to the same sources, which acts as the `compiler-builtins` Cargo entrypoint
+# for out of tree testing
+
+[package]
+name = "compiler_builtins"
+version = "0.1.160"
+authors = ["Jorge Aparicio <japaricious@gmail.com>"]
+description = "Compiler intrinsics used by the Rust compiler."
+repository = "https://github.com/rust-lang/compiler-builtins"
+license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
+edition = "2024"
+publish = false
+links = "compiler-rt"
+
+build = "../compiler-builtins/build.rs"
+
+[lib]
+path = "../compiler-builtins/src/lib.rs"
+bench = false
+doctest = false
+test = false
+
+[build-dependencies]
+cc = { optional = true, version = "1.2" }
+
+[features]
+default = ["compiler-builtins"]
+
+# Enable compilation of C code in compiler-rt, filling in some more optimized
+# implementations and also filling in unimplemented intrinsics
+c = ["dep:cc"]
+
+# Workaround for the Cranelift codegen backend. Disables any implementations
+# which use inline assembly and fall back to pure Rust versions (if available).
+no-asm = []
+
+# Workaround for codegen backends which haven't yet implemented `f16` and
+# `f128` support. Disabled any intrinsics which use those types.
+no-f16-f128 = []
+
+# Flag this library as the unstable compiler-builtins lib
+compiler-builtins = []
+
+# Generate memory-related intrinsics like memcpy
+mem = []
+
+# Mangle all names so this can be linked in with other versions or other
+# compiler-rt implementations. Also used for testing
+mangled-names = []
+
+# Only used in the compiler's build system
+rustc-dep-of-std = ["compiler-builtins"]
+
+# This makes certain traits and function specializations public that
+# are not normally public but are required by the `builtins-test`
+unstable-public-internals = []
diff --git a/builtins-test-intrinsics/Cargo.toml b/builtins-test-intrinsics/Cargo.toml
index 064b7cad2..e73a1f7b1 100644
--- a/builtins-test-intrinsics/Cargo.toml
+++ b/builtins-test-intrinsics/Cargo.toml
@@ -6,7 +6,7 @@ publish = false
 license = "MIT OR Apache-2.0"
 
 [dependencies]
-compiler_builtins = { path = "../compiler-builtins", features = ["compiler-builtins"] }
+compiler_builtins = { path = "../builtins-shim", features = ["compiler-builtins"] }
 panic-handler = { path = "../crates/panic-handler" }
 
 [features]
diff --git a/builtins-test/Cargo.toml b/builtins-test/Cargo.toml
index c7742aa24..093d4633f 100644
--- a/builtins-test/Cargo.toml
+++ b/builtins-test/Cargo.toml
@@ -17,7 +17,7 @@ rustc_apfloat = "0.2.2"
 iai-callgrind = { version = "0.14.1", optional = true }
 
 [dependencies.compiler_builtins]
-path = "../compiler-builtins"
+path = "../builtins-shim"
 default-features = false
 features = ["unstable-public-internals"]
 
diff --git a/compiler-builtins/Cargo.toml b/compiler-builtins/Cargo.toml
index dffdcaf94..c5446cd76 100644
--- a/compiler-builtins/Cargo.toml
+++ b/compiler-builtins/Cargo.toml
@@ -1,3 +1,9 @@
+# NOTE: Must be kept in sync with `../builtins-shim/Cargo.toml`.
+#
+# This manifest is actually used in-tree by rust-lang/rust,
+# `../builtins-shim/Cargo.toml` is used by out-of-tree testing. See the other
+# manifest for further details.
+
 [package]
 name = "compiler_builtins"
 version = "0.1.160"
@@ -15,9 +21,7 @@ doctest = false
 test = false
 
 [dependencies]
-# For more information on this dependency see
-# https://github.com/rust-lang/rust/tree/master/library/rustc-std-workspace-core
-core = { version = "1.0.1", optional = true, package = "rustc-std-workspace-core" }
+core = { path = "../../core", optional = true }
 
 [build-dependencies]
 cc = { optional = true, version = "1.2" }
diff --git a/compiler-builtins/build.rs b/compiler-builtins/build.rs
index e909a0dcb..018899faf 100644
--- a/compiler-builtins/build.rs
+++ b/compiler-builtins/build.rs
@@ -19,6 +19,9 @@ fn main() {
 
     println!("cargo:compiler-rt={}", cwd.join("compiler-rt").display());
 
+    println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)");
+    println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))");
+
     // Emscripten's runtime includes all the builtins
     if target.os == "emscripten" {
         return;
@@ -44,7 +47,6 @@ fn main() {
     }
 
     // These targets have hardware unaligned access support.
-    println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))");
     if target.arch.contains("x86_64")
         || target.arch.contains("x86")
         || target.arch.contains("aarch64")
@@ -75,7 +77,6 @@ fn main() {
     // Only emit the ARM Linux atomic emulation on pre-ARMv6 architectures. This
     // includes the old androideabi. It is deprecated but it is available as a
     // rustc target (arm-linux-androideabi).
-    println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)");
     if llvm_target[0] == "armv4t"
         || llvm_target[0] == "armv5te"
         || target.triple == "arm-linux-androideabi"
diff --git a/compiler-builtins/src/aarch64_linux.rs b/compiler-builtins/src/aarch64_linux.rs
index 2402a3fe1..38fcab152 100644
--- a/compiler-builtins/src/aarch64_linux.rs
+++ b/compiler-builtins/src/aarch64_linux.rs
@@ -4,7 +4,7 @@
 //! To avoid breaking backwards compat, C toolchains introduced a concept of "outlined atomics",
 //! where atomic operations call into the compiler runtime to dispatch between two depending on
 //! which is supported on the current CPU.
-//! See https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics for more discussion.
+//! See <https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics> for more discussion.
 //!
 //! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
 //! Use the `compiler-rt` intrinsics if you want LSE support.
diff --git a/compiler-builtins/src/arm_linux.rs b/compiler-builtins/src/arm_linux.rs
index 6ce67ba71..ab9f86807 100644
--- a/compiler-builtins/src/arm_linux.rs
+++ b/compiler-builtins/src/arm_linux.rs
@@ -4,12 +4,17 @@ use core::{arch, mem};
 // Kernel-provided user-mode helper functions:
 // https://www.kernel.org/doc/Documentation/arm/kernel_user_helpers.txt
 unsafe fn __kuser_cmpxchg(oldval: u32, newval: u32, ptr: *mut u32) -> bool {
-    let f: extern "C" fn(u32, u32, *mut u32) -> u32 = mem::transmute(0xffff0fc0usize as *const ());
+    // FIXME(volatile): the third parameter is a volatile pointer
+    // SAFETY: kernel docs specify a known address with the given signature
+    let f = unsafe {
+        mem::transmute::<_, extern "C" fn(u32, u32, *mut u32) -> u32>(0xffff0fc0usize as *const ())
+    };
     f(oldval, newval, ptr) == 0
 }
 
 unsafe fn __kuser_memory_barrier() {
-    let f: extern "C" fn() = mem::transmute(0xffff0fa0usize as *const ());
+    // SAFETY: kernel docs specify a known address with the given signature
+    let f = unsafe { mem::transmute::<_, extern "C" fn()>(0xffff0fa0usize as *const ()) };
     f();
 }
 
@@ -67,8 +72,10 @@ fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
 /// - if `size_of::<T>() == 2`, `ptr` or `ptr` offset by 2 bytes must be valid for a relaxed atomic
 ///   read of 2 bytes.
 /// - if `size_of::<T>() == 4`, `ptr` must be valid for a relaxed atomic read of 4 bytes.
+// FIXME: assert some of the preconditions in debug mode
 unsafe fn atomic_load_aligned<T>(ptr: *mut u32) -> u32 {
-    if mem::size_of::<T>() == 4 {
+    const { assert!(size_of::<T>() <= 4) };
+    if size_of::<T>() == 4 {
         // SAFETY: As `T` has a size of 4, the caller garantees this is sound.
         unsafe { AtomicU32::from_ptr(ptr).load(Ordering::Relaxed) }
     } else {
@@ -100,11 +107,13 @@ unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T,
     let (shift, mask) = get_shift_mask(ptr);
 
     loop {
-        let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
+        // FIXME(safety): preconditions review needed
+        let curval_aligned = unsafe { atomic_load_aligned::<T>(aligned_ptr) };
         let curval = extract_aligned(curval_aligned, shift, mask);
         let newval = f(curval);
         let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
-        if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
+        // FIXME(safety): preconditions review needed
+        if unsafe { __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) } {
             return g(curval, newval);
         }
     }
@@ -116,13 +125,15 @@ unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 {
     let (shift, mask) = get_shift_mask(ptr);
 
     loop {
-        let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
+        // FIXME(safety): preconditions review needed
+        let curval_aligned = unsafe { atomic_load_aligned::<T>(aligned_ptr) };
         let curval = extract_aligned(curval_aligned, shift, mask);
         if curval != oldval {
             return curval;
         }
         let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
-        if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
+        // FIXME(safety): preconditions review needed
+        if unsafe { __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) } {
             return oldval;
         }
     }
@@ -132,7 +143,14 @@ macro_rules! atomic_rmw {
     ($name:ident, $ty:ty, $op:expr, $fetch:expr) => {
         intrinsics! {
             pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty {
-                atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty
+                // FIXME(safety): preconditions review needed
+                unsafe {
+                    atomic_rmw(
+                        ptr,
+                        |x| $op(x as $ty, val) as u32,
+                        |old, new| $fetch(old, new)
+                    ) as $ty
+                }
             }
         }
     };
@@ -149,7 +167,8 @@ macro_rules! atomic_cmpxchg {
     ($name:ident, $ty:ty) => {
         intrinsics! {
             pub unsafe extern "C" fn $name(ptr: *mut $ty, oldval: $ty, newval: $ty) -> $ty {
-                atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty
+                // FIXME(safety): preconditions review needed
+                unsafe { atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty }
             }
         }
     };
@@ -285,6 +304,7 @@ atomic_cmpxchg!(__sync_val_compare_and_swap_4, u32);
 
 intrinsics! {
     pub unsafe extern "C" fn __sync_synchronize() {
-        __kuser_memory_barrier();
+       // SAFETY: preconditions are the same as the calling function.
+       unsafe {  __kuser_memory_barrier() };
     }
 }
diff --git a/compiler-builtins/src/lib.rs b/compiler-builtins/src/lib.rs
index ef3299d69..1cec39d8b 100644
--- a/compiler-builtins/src/lib.rs
+++ b/compiler-builtins/src/lib.rs
@@ -9,6 +9,7 @@
 #![feature(naked_functions)]
 #![feature(repr_simd)]
 #![feature(macro_metavar_expr_concat)]
+#![feature(rustc_attrs)]
 #![cfg_attr(f16_enabled, feature(f16))]
 #![cfg_attr(f128_enabled, feature(f128))]
 #![no_builtins]
diff --git a/compiler-builtins/src/mem/x86_64.rs b/compiler-builtins/src/mem/x86_64.rs
index 5cbe83ab1..fb29eb11b 100644
--- a/compiler-builtins/src/mem/x86_64.rs
+++ b/compiler-builtins/src/mem/x86_64.rs
@@ -69,7 +69,7 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
         "rep movsb",
         "sub $7, %rsi",
         "sub $7, %rdi",
-        "mov {qword_count}, %rcx",
+        "mov {qword_count:r}, %rcx",
         "rep movsq",
         "test {pre_byte_count:e}, {pre_byte_count:e}",
         "add $7, %rsi",
@@ -212,7 +212,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
     let x = {
         let r;
         asm!(
-            "movdqa ({addr}), {dest}",
+            "movdqa ({addr:r}), {dest}",
             addr = in(reg) s,
             dest = out(xmm_reg) r,
             options(att_syntax, nostack),
@@ -232,7 +232,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
         let x = {
             let r;
             asm!(
-                "movdqa ({addr}), {dest}",
+                "movdqa ({addr:r}), {dest}",
                 addr = in(reg) s,
                 dest = out(xmm_reg) r,
                 options(att_syntax, nostack),
diff --git a/compiler-builtins/src/probestack.rs b/compiler-builtins/src/probestack.rs
index c9070cf55..1441fd73b 100644
--- a/compiler-builtins/src/probestack.rs
+++ b/compiler-builtins/src/probestack.rs
@@ -49,304 +49,198 @@
 // We only define stack probing for these architectures today.
 #![cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 
-// SAFETY: defined in this module.
-// FIXME(extern_custom): the ABI is not correct.
-unsafe extern "C" {
-    pub fn __rust_probestack();
-}
-
-// A wrapper for our implementation of __rust_probestack, which allows us to
-// keep the assembly inline while controlling all CFI directives in the assembly
-// emitted for the function.
-//
-// This is the ELF version.
-#[cfg(not(any(target_vendor = "apple", target_os = "uefi")))]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .pushsection .text.__rust_probestack
-            .globl __rust_probestack
-            .type  __rust_probestack, @function
-            .hidden __rust_probestack
-        __rust_probestack:
-            ",
-            $body,
-            "
-            .size __rust_probestack, . - __rust_probestack
-            .popsection
-            "
-        )
-    };
-}
-
-#[cfg(all(target_os = "uefi", target_arch = "x86_64"))]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .globl __rust_probestack
-        __rust_probestack:
-            ",
-            $body
-        )
-    };
-}
-
-// Same as above, but for Mach-O. Note that the triple underscore
-// is deliberate
-#[cfg(target_vendor = "apple")]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .globl ___rust_probestack
-        ___rust_probestack:
-            ",
-            $body
-        )
-    };
-}
-
-// In UEFI x86 arch, triple underscore is deliberate.
-#[cfg(all(target_os = "uefi", target_arch = "x86"))]
-macro_rules! define_rust_probestack {
-    ($body: expr) => {
-        concat!(
-            "
-            .globl ___rust_probestack
-        ___rust_probestack:
-            ",
-            $body
-        )
-    };
-}
-
 // Our goal here is to touch each page between %rsp+8 and %rsp+8-%rax,
 // ensuring that if any pages are unmapped we'll make a page fault.
 //
+// FIXME(abi_custom): This function is unsafe because it uses a custom ABI,
+// it does not actually match `extern "C"`.
+//
 // The ABI here is that the stack frame size is located in `%rax`. Upon
 // return we're not supposed to modify `%rsp` or `%rax`.
-//
-// Any changes to this function should be replicated to the SGX version below.
-#[cfg(all(
-    target_arch = "x86_64",
-    not(all(target_env = "sgx", target_vendor = "fortanix"))
-))]
-core::arch::global_asm!(
-    define_rust_probestack!(
-        "
-    .cfi_startproc
-    pushq  %rbp
-    .cfi_adjust_cfa_offset 8
-    .cfi_offset %rbp, -16
-    movq   %rsp, %rbp
-    .cfi_def_cfa_register %rbp
-
-    mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
-
-    // Main loop, taken in one page increments. We're decrementing rsp by
-    // a page each time until there's less than a page remaining. We're
-    // guaranteed that this function isn't called unless there's more than a
-    // page needed.
-    //
-    // Note that we're also testing against `8(%rsp)` to account for the 8
-    // bytes pushed on the stack orginally with our return address. Using
-    // `8(%rsp)` simulates us testing the stack pointer in the caller's
-    // context.
-
-    // It's usually called when %rax >= 0x1000, but that's not always true.
-    // Dynamic stack allocation, which is needed to implement unsized
-    // rvalues, triggers stackprobe even if %rax < 0x1000.
-    // Thus we have to check %r11 first to avoid segfault.
-    cmp    $0x1000,%r11
-    jna    3f
-2:
-    sub    $0x1000,%rsp
-    test   %rsp,8(%rsp)
-    sub    $0x1000,%r11
-    cmp    $0x1000,%r11
-    ja     2b
-
-3:
-    // Finish up the last remaining stack space requested, getting the last
-    // bits out of r11
-    sub    %r11,%rsp
-    test   %rsp,8(%rsp)
-
-    // Restore the stack pointer to what it previously was when entering
-    // this function. The caller will readjust the stack pointer after we
-    // return.
-    add    %rax,%rsp
-
-    leave
-    .cfi_def_cfa_register %rsp
-    .cfi_adjust_cfa_offset -8
-    ret
-    .cfi_endproc
-    "
-    ),
-    options(att_syntax)
-);
+#[cfg(target_arch = "x86_64")]
+#[unsafe(naked)]
+#[rustc_std_internal_symbol]
+pub unsafe extern "C" fn __rust_probestack() {
+    #[cfg(not(all(target_env = "sgx", target_vendor = "fortanix")))]
+    macro_rules! ret {
+        () => {
+            "ret"
+        };
+    }
+
+    #[cfg(all(target_env = "sgx", target_vendor = "fortanix"))]
+    macro_rules! ret {
+        // for this target, [manually patch for LVI].
+        //
+        // [manually patch for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+        () => {
+            "
+            pop %r11
+            lfence
+            jmp *%r11
+            "
+        };
+    }
 
-// This function is the same as above, except that some instructions are
-// [manually patched for LVI].
-//
-// [manually patched for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
-#[cfg(all(
-    target_arch = "x86_64",
-    all(target_env = "sgx", target_vendor = "fortanix")
-))]
-core::arch::global_asm!(
-    define_rust_probestack!(
+    core::arch::naked_asm!(
         "
-    .cfi_startproc
-    pushq  %rbp
-    .cfi_adjust_cfa_offset 8
-    .cfi_offset %rbp, -16
-    movq   %rsp, %rbp
-    .cfi_def_cfa_register %rbp
-
-    mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
-
-    // Main loop, taken in one page increments. We're decrementing rsp by
-    // a page each time until there's less than a page remaining. We're
-    // guaranteed that this function isn't called unless there's more than a
-    // page needed.
-    //
-    // Note that we're also testing against `8(%rsp)` to account for the 8
-    // bytes pushed on the stack orginally with our return address. Using
-    // `8(%rsp)` simulates us testing the stack pointer in the caller's
-    // context.
-
-    // It's usually called when %rax >= 0x1000, but that's not always true.
-    // Dynamic stack allocation, which is needed to implement unsized
-    // rvalues, triggers stackprobe even if %rax < 0x1000.
-    // Thus we have to check %r11 first to avoid segfault.
-    cmp    $0x1000,%r11
-    jna    3f
-2:
-    sub    $0x1000,%rsp
-    test   %rsp,8(%rsp)
-    sub    $0x1000,%r11
-    cmp    $0x1000,%r11
-    ja     2b
-
-3:
-    // Finish up the last remaining stack space requested, getting the last
-    // bits out of r11
-    sub    %r11,%rsp
-    test   %rsp,8(%rsp)
-
-    // Restore the stack pointer to what it previously was when entering
-    // this function. The caller will readjust the stack pointer after we
-    // return.
-    add    %rax,%rsp
-
-    leave
-    .cfi_def_cfa_register %rsp
-    .cfi_adjust_cfa_offset -8
-    pop %r11
-    lfence
-    jmp *%r11
-    .cfi_endproc
-    "
-    ),
-    options(att_syntax)
-);
+            .cfi_startproc
+            pushq  %rbp
+            .cfi_adjust_cfa_offset 8
+            .cfi_offset %rbp, -16
+            movq   %rsp, %rbp
+            .cfi_def_cfa_register %rbp
+
+            mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
+
+            // Main loop, taken in one page increments. We're decrementing rsp by
+            // a page each time until there's less than a page remaining. We're
+            // guaranteed that this function isn't called unless there's more than a
+            // page needed.
+            //
+            // Note that we're also testing against `8(%rsp)` to account for the 8
+            // bytes pushed on the stack orginally with our return address. Using
+            // `8(%rsp)` simulates us testing the stack pointer in the caller's
+            // context.
+
+            // It's usually called when %rax >= 0x1000, but that's not always true.
+            // Dynamic stack allocation, which is needed to implement unsized
+            // rvalues, triggers stackprobe even if %rax < 0x1000.
+            // Thus we have to check %r11 first to avoid segfault.
+            cmp    $0x1000,%r11
+            jna    3f
+        2:
+            sub    $0x1000,%rsp
+            test   %rsp,8(%rsp)
+            sub    $0x1000,%r11
+            cmp    $0x1000,%r11
+            ja     2b
+
+        3:
+            // Finish up the last remaining stack space requested, getting the last
+            // bits out of r11
+            sub    %r11,%rsp
+            test   %rsp,8(%rsp)
+
+            // Restore the stack pointer to what it previously was when entering
+            // this function. The caller will readjust the stack pointer after we
+            // return.
+            add    %rax,%rsp
+
+            leave
+            .cfi_def_cfa_register %rsp
+            .cfi_adjust_cfa_offset -8
+    ",
+        ret!(),
+        "
+            .cfi_endproc
+    ",
+        options(att_syntax)
+    )
+}
 
 #[cfg(all(target_arch = "x86", not(target_os = "uefi")))]
 // This is the same as x86_64 above, only translated for 32-bit sizes. Note
 // that on Unix we're expected to restore everything as it was, this
 // function basically can't tamper with anything.
 //
+// FIXME(abi_custom): This function is unsafe because it uses a custom ABI,
+// it does not actually match `extern "C"`.
+//
 // The ABI here is the same as x86_64, except everything is 32-bits large.
-core::arch::global_asm!(
-    define_rust_probestack!(
+#[unsafe(naked)]
+#[rustc_std_internal_symbol]
+pub unsafe extern "C" fn __rust_probestack() {
+    core::arch::naked_asm!(
         "
-    .cfi_startproc
-    push   %ebp
-    .cfi_adjust_cfa_offset 4
-    .cfi_offset %ebp, -8
-    mov    %esp, %ebp
-    .cfi_def_cfa_register %ebp
-    push   %ecx
-    mov    %eax,%ecx
-
-    cmp    $0x1000,%ecx
-    jna    3f
-2:
-    sub    $0x1000,%esp
-    test   %esp,8(%esp)
-    sub    $0x1000,%ecx
-    cmp    $0x1000,%ecx
-    ja     2b
-
-3:
-    sub    %ecx,%esp
-    test   %esp,8(%esp)
-
-    add    %eax,%esp
-    pop    %ecx
-    leave
-    .cfi_def_cfa_register %esp
-    .cfi_adjust_cfa_offset -4
-    ret
-    .cfi_endproc
-    "
-    ),
-    options(att_syntax)
-);
+            .cfi_startproc
+            push   %ebp
+            .cfi_adjust_cfa_offset 4
+            .cfi_offset %ebp, -8
+            mov    %esp, %ebp
+            .cfi_def_cfa_register %ebp
+            push   %ecx
+            mov    %eax,%ecx
+
+            cmp    $0x1000,%ecx
+            jna    3f
+        2:
+            sub    $0x1000,%esp
+            test   %esp,8(%esp)
+            sub    $0x1000,%ecx
+            cmp    $0x1000,%ecx
+            ja     2b
+
+        3:
+            sub    %ecx,%esp
+            test   %esp,8(%esp)
+
+            add    %eax,%esp
+            pop    %ecx
+            leave
+            .cfi_def_cfa_register %esp
+            .cfi_adjust_cfa_offset -4
+            ret
+            .cfi_endproc
+    ",
+        options(att_syntax)
+    )
+}
 
 #[cfg(all(target_arch = "x86", target_os = "uefi"))]
 // UEFI target is windows like target. LLVM will do _chkstk things like windows.
 // probestack function will also do things like _chkstk in MSVC.
 // So we need to sub %ax %sp in probestack when arch is x86.
 //
+// FIXME(abi_custom): This function is unsafe because it uses a custom ABI,
+// it does not actually match `extern "C"`.
+//
 // REF: Rust commit(74e80468347)
 // rust\src\llvm-project\llvm\lib\Target\X86\X86FrameLowering.cpp: 805
 // Comments in LLVM:
 //   MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
 //   MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
 //   themselves.
-core::arch::global_asm!(
-    define_rust_probestack!(
+#[unsafe(naked)]
+#[rustc_std_internal_symbol]
+pub unsafe extern "C" fn __rust_probestack() {
+    core::arch::naked_asm!(
         "
-    .cfi_startproc
-    push   %ebp
-    .cfi_adjust_cfa_offset 4
-    .cfi_offset %ebp, -8
-    mov    %esp, %ebp
-    .cfi_def_cfa_register %ebp
-    push   %ecx
-    push   %edx
-    mov    %eax,%ecx
-
-    cmp    $0x1000,%ecx
-    jna    3f
-2:
-    sub    $0x1000,%esp
-    test   %esp,8(%esp)
-    sub    $0x1000,%ecx
-    cmp    $0x1000,%ecx
-    ja     2b
-
-3:
-    sub    %ecx,%esp
-    test   %esp,8(%esp)
-    mov    4(%ebp),%edx
-    mov    %edx, 12(%esp)
-    add    %eax,%esp
-    pop    %edx
-    pop    %ecx
-    leave
-
-    sub   %eax, %esp
-    .cfi_def_cfa_register %esp
-    .cfi_adjust_cfa_offset -4
-    ret
-    .cfi_endproc
-    "
-    ),
-    options(att_syntax)
-);
+            .cfi_startproc
+            push   %ebp
+            .cfi_adjust_cfa_offset 4
+            .cfi_offset %ebp, -8
+            mov    %esp, %ebp
+            .cfi_def_cfa_register %ebp
+            push   %ecx
+            push   %edx
+            mov    %eax,%ecx
+
+            cmp    $0x1000,%ecx
+            jna    3f
+        2:
+            sub    $0x1000,%esp
+            test   %esp,8(%esp)
+            sub    $0x1000,%ecx
+            cmp    $0x1000,%ecx
+            ja     2b
+
+        3:
+            sub    %ecx,%esp
+            test   %esp,8(%esp)
+            mov    4(%ebp),%edx
+            mov    %edx, 12(%esp)
+            add    %eax,%esp
+            pop    %edx
+            pop    %ecx
+            leave
+
+            sub   %eax, %esp
+            .cfi_def_cfa_register %esp
+            .cfi_adjust_cfa_offset -4
+            ret
+            .cfi_endproc
+    ",
+        options(att_syntax)
+    )
+}
diff --git a/rust-version b/rust-version
index e05aaa057..731839835 100644
--- a/rust-version
+++ b/rust-version
@@ -1 +1 @@
-df8102fe5f24f28a918660b0cd918d7331c3896e
+d087f112b7d1323446c7b39a8b616aee7fa56b3d