chacha: Move x86_64 dispatching from assembly to Rust (Merge BoringSSL f5e0c8f)

Also, the `_nohw` functions do not require `cpu::Features`. Make that clearer
for the AArch64/ARM branches.
briansmith committed Jan 17, 2025
2 parents b72dd0d + f5e0c8f commit 737a3ec
Showing 5 changed files with 117 additions and 68 deletions.
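
The core of the change is visible in src/aead/chacha.rs below: the CPU dispatch that previously lived in ChaCha20_ctr32's assembly prologue is now ordinary Rust. As a rough, self-contained sketch of the selection order — the enum and function here are illustrative stand-ins, not ring's API:

/// Illustrative stand-ins only: which ChaCha20 kernel the new x86_64
/// dispatch in src/aead/chacha.rs selects. Not part of ring's API.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, PartialEq)]
enum Kernel {
    Avx2,     // ChaCha20_ctr32_avx2
    Ssse3_4x, // ChaCha20_ctr32_ssse3_4x
    Ssse3,    // ChaCha20_ctr32_ssse3
    Nohw,     // ChaCha20_ctr32_nohw: no CPU features required
    None,     // zero-length input: nothing to do
}

fn select_kernel(len: usize, avx2: bool, ssse3: bool, silvermont_like: bool) -> Kernel {
    // Inputs of 128 bytes or fewer always take the portable path.
    const SSE_MIN_LEN: usize = 128 + 1;
    if len >= SSE_MIN_LEN {
        if avx2 {
            return Kernel::Avx2;
        }
        if ssse3 {
            // The old assembly's Atom check, now in Rust: on Silvermont-like
            // CPUs the 4x-wide kernel only pays off above 192 bytes.
            return if len >= 192 || !silvermont_like {
                Kernel::Ssse3_4x
            } else {
                Kernel::Ssse3
            };
        }
    }
    if len >= 1 { Kernel::Nohw } else { Kernel::None }
}

fn main() {
    assert_eq!(select_kernel(64, true, true, false), Kernel::Nohw);
    assert_eq!(select_kernel(256, true, true, false), Kernel::Avx2);
    assert_eq!(select_kernel(150, false, true, true), Kernel::Ssse3);
    assert_eq!(select_kernel(256, false, true, true), Kernel::Ssse3_4x);
}

Note that the `>= 192 || !silvermont_like` test reproduces the old assembly's Atom special case, visible in the removed lines of chacha-x86_64.pl below.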
build.rs: 3 changes (3 additions & 0 deletions)
@@ -855,8 +855,11 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"CRYPTO_poly1305_update",
"CRYPTO_poly1305_update_neon",
"ChaCha20_ctr32",
"ChaCha20_ctr32_avx2",
"ChaCha20_ctr32_neon",
"ChaCha20_ctr32_nohw",
"ChaCha20_ctr32_ssse3",
"ChaCha20_ctr32_ssse3_4x",
"LIMBS_add_mod",
"LIMBS_are_even",
"LIMBS_are_zero",
crypto/chacha/asm/chacha-x86_64.pl: 94 changes (35 additions & 59 deletions)
@@ -76,8 +76,6 @@
 $code.=<<___;
 .text
 
-.extern OPENSSL_ia32cap_P
-
 .section .rodata
 .align 64
 .Lzero:
@@ -226,20 +224,12 @@ sub ROUND { # critical path is 24 cycles per round
 ########################################################################
 # Generic code path that handles all lengths on pre-SSSE3 processors.
 $code.=<<___;
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function,5
+.globl ChaCha20_ctr32_nohw
+.type ChaCha20_ctr32_nohw,\@function,5
 .align 64
-ChaCha20_ctr32:
+ChaCha20_ctr32_nohw:
 .cfi_startproc
 _CET_ENDBR
-cmp \$0,$len
-je .Lno_data
-mov OPENSSL_ia32cap_P+4(%rip),%r10
-___
-$code.=<<___;
-test \$`1<<(41-32)`,%r10d
-jnz .LChaCha20_ssse3
-
 push %rbx
 .cfi_push rbx
 push %rbp
@@ -411,7 +401,7 @@ sub ROUND { # critical path is 24 cycles per round
 .Lno_data:
 ret
 .cfi_endproc
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
+.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
 ___
 
 ########################################################################
@@ -446,19 +436,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
 my $xframe = $win64 ? 32+8 : 8;
 
 $code.=<<___;
-.type ChaCha20_ssse3,\@function,5
+.globl ChaCha20_ctr32_ssse3
+.type ChaCha20_ctr32_ssse3,\@function,5
 .align 32
-ChaCha20_ssse3:
-.LChaCha20_ssse3:
+ChaCha20_ctr32_ssse3:
 .cfi_startproc
 _CET_ENDBR
 mov %rsp,%r9 # frame pointer
 .cfi_def_cfa_register r9
 ___
 $code.=<<___;
-cmp \$128,$len # we might throw away some data,
-ja .LChaCha20_4x # but overall it won't be slower
 
-.Ldo_sse3_after_all:
 sub \$64+$xframe,%rsp
 ___
 $code.=<<___ if ($win64);
@@ -568,7 +555,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
 .Lssse3_epilogue:
 ret
 .cfi_endproc
-.size ChaCha20_ssse3,.-ChaCha20_ssse3
+.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
 ___
 }
 
@@ -706,29 +693,17 @@ sub SSSE3_lane_ROUND {
 my $xframe = $win64 ? 0xa8 : 8;
 
 $code.=<<___;
-.type ChaCha20_4x,\@function,5
+.globl ChaCha20_ctr32_ssse3_4x
+.type ChaCha20_ctr32_ssse3_4x,\@function,5
 .align 32
-ChaCha20_4x:
-.LChaCha20_4x:
+ChaCha20_ctr32_ssse3_4x:
 .cfi_startproc
+_CET_ENDBR
 mov %rsp,%r9 # frame pointer
 .cfi_def_cfa_register r9
-mov %r10,%r11
-___
-$code.=<<___ if ($avx>1);
-shr \$32,%r10 # OPENSSL_ia32cap_P+8
-test \$`1<<5`,%r10 # test AVX2
-jnz .LChaCha20_8x
 ___
 $code.=<<___;
-cmp \$192,$len
-ja .Lproceed4x
-
-and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
-cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
-je .Ldo_sse3_after_all # to detect Atom
 
-.Lproceed4x:
 sub \$0x140+$xframe,%rsp
 ___
 ################ stack layout
@@ -1156,7 +1131,7 @@ sub SSSE3_lane_ROUND {
 .L4x_epilogue:
 ret
 .cfi_endproc
-.size ChaCha20_4x,.-ChaCha20_4x
+.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
 ___
 }
 
@@ -1285,11 +1260,12 @@ sub AVX2_lane_ROUND {
 my $xframe = $win64 ? 0xa8 : 8;
 
 $code.=<<___;
-.type ChaCha20_8x,\@function,5
+.globl ChaCha20_ctr32_avx2
+.type ChaCha20_ctr32_avx2,\@function,5
 .align 32
-ChaCha20_8x:
-.LChaCha20_8x:
+ChaCha20_ctr32_avx2:
 .cfi_startproc
+_CET_ENDBR
 mov %rsp,%r9 # frame register
 .cfi_def_cfa_register r9
 sub \$0x280+$xframe,%rsp
@@ -1801,7 +1777,7 @@ sub AVX2_lane_ROUND {
 .L8x_epilogue:
 ret
 .cfi_endproc
-.size ChaCha20_8x,.-ChaCha20_8x
+.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
 ___
 }
 
@@ -1985,42 +1961,42 @@ sub AVX2_lane_ROUND {

 .section .pdata
 .align 4
-.rva .LSEH_begin_ChaCha20_ctr32
-.rva .LSEH_end_ChaCha20_ctr32
-.rva .LSEH_info_ChaCha20_ctr32
+.rva .LSEH_begin_ChaCha20_ctr32_nohw
+.rva .LSEH_end_ChaCha20_ctr32_nohw
+.rva .LSEH_info_ChaCha20_ctr32_nohw
 
-.rva .LSEH_begin_ChaCha20_ssse3
-.rva .LSEH_end_ChaCha20_ssse3
-.rva .LSEH_info_ChaCha20_ssse3
+.rva .LSEH_begin_ChaCha20_ctr32_ssse3
+.rva .LSEH_end_ChaCha20_ctr32_ssse3
+.rva .LSEH_info_ChaCha20_ctr32_ssse3
 
-.rva .LSEH_begin_ChaCha20_4x
-.rva .LSEH_end_ChaCha20_4x
-.rva .LSEH_info_ChaCha20_4x
+.rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x
+.rva .LSEH_end_ChaCha20_ctr32_ssse3_4x
+.rva .LSEH_info_ChaCha20_ctr32_ssse3_4x
 ___
 $code.=<<___ if ($avx>1);
-.rva .LSEH_begin_ChaCha20_8x
-.rva .LSEH_end_ChaCha20_8x
-.rva .LSEH_info_ChaCha20_8x
+.rva .LSEH_begin_ChaCha20_ctr32_avx2
+.rva .LSEH_end_ChaCha20_ctr32_avx2
+.rva .LSEH_info_ChaCha20_ctr32_avx2
 ___
 $code.=<<___;
 .section .xdata
 .align 8
-.LSEH_info_ChaCha20_ctr32:
+.LSEH_info_ChaCha20_ctr32_nohw:
 .byte 9,0,0,0
 .rva se_handler
 
-.LSEH_info_ChaCha20_ssse3:
+.LSEH_info_ChaCha20_ctr32_ssse3:
 .byte 9,0,0,0
 .rva ssse3_handler
 .rva .Lssse3_body,.Lssse3_epilogue
 
-.LSEH_info_ChaCha20_4x:
+.LSEH_info_ChaCha20_ctr32_ssse3_4x:
 .byte 9,0,0,0
 .rva full_handler
 .rva .L4x_body,.L4x_epilogue
 ___
 $code.=<<___ if ($avx>1);
-.LSEH_info_ChaCha20_8x:
+.LSEH_info_ChaCha20_ctr32_avx2:
 .byte 9,0,0,0
 .rva full_handler
 .rva .L8x_body,.L8x_epilogue # HandlerData[]
crypto/cpu_intel.c: 6 changes (4 additions & 2 deletions)
@@ -150,7 +150,8 @@ void OPENSSL_cpuid_setup(void) {

 // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
 // some Silvermont-specific codepaths which perform better. See OpenSSL
-// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f.
+// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and
+// |CRYPTO_cpu_perf_is_like_silvermont|.
 if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
     (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
   ecx &= ~(1u << 26);
@@ -177,7 +178,8 @@ void OPENSSL_cpuid_setup(void) {
 // Clear AVX2 and AVX512* bits.
 //
 // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream
-// doesn't clear those.
+// doesn't clear those. See the comments in
+// |CRYPTO_hardware_supports_XSAVE|.
 extended_features[0] &=
     ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31));
 }
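As context for the masks in this hunk: CPUID leaf 1 packs the stepping into EAX bits 3:0, model into 7:4, family into 11:8, extended model into 19:16, and extended family into 27:20, so `eax & 0x0fff0ff0` keeps the family/model fields while ignoring stepping and processor type. A small sketch (the helper name is hypothetical):

/// Reduces a CPUID leaf-1 EAX value to the family/model "signature" compared
/// above: keep extended family (bits 27:20), extended model (19:16),
/// family (11:8), and model (7:4); drop processor type (13:12) and
/// stepping (3:0). Helper name is hypothetical.
fn cpuid_signature(eax: u32) -> u32 {
    eax & 0x0fff_0ff0
}

fn main() {
    // Knights Landing: family 0x6, model 0x57 (ext. model 0x5, model 0x7);
    // any stepping yields the signature 0x0005_0670 tested above.
    assert_eq!(cpuid_signature(0x0005_0673), 0x0005_0670);
    // Knights Mill: family 0x6, model 0x85 -> signature 0x0008_0650.
    assert_eq!(cpuid_signature(0x0008_0651), 0x0008_0650);
}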
src/aead/chacha.rs: 41 changes (34 additions & 7 deletions)
@@ -97,8 +97,8 @@ impl Key {
 }
 if in_out.len() >= 1 {
     chacha20_ctr32_ffi!(
-        unsafe { (1, cpu::Features, Overlapping<'_>) => ChaCha20_ctr32_nohw },
-        self, counter, in_out, cpu)
+        unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw },
+        self, counter, in_out, ())
 }
 } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] {
 use cpu::{GetFeature as _, arm::Neon};
@@ -112,18 +112,45 @@ impl Key {
 }
 if in_out.len() >= 1 {
     chacha20_ctr32_ffi!(
-        unsafe { (1, cpu::Features, &mut [u8]) => ChaCha20_ctr32_nohw },
-        self, counter, in_out.copy_within(), cpu)
+        unsafe { (1, (), &mut [u8]) => ChaCha20_ctr32_nohw },
+        self, counter, in_out.copy_within(), ())
 }
 } else if #[cfg(target_arch = "x86")] {
 chacha20_ctr32_ffi!(
     unsafe { (0, cpu::Features, &mut [u8]) => ChaCha20_ctr32 },
     self, counter, in_out.copy_within(), cpu)
 } else if #[cfg(target_arch = "x86_64")] {
-chacha20_ctr32_ffi!(
-    unsafe { (0, cpu::Features, Overlapping<'_>) => ChaCha20_ctr32 },
-    self, counter, in_out, cpu)
+use cpu::{GetFeature, intel::{Avx2, Ssse3}};
+const SSE_MIN_LEN: usize = 128 + 1; // Also AVX2, SSSE3_4X, SSSE3
+if in_out.len() >= SSE_MIN_LEN {
+    if let Some(cpu) = cpu.get_feature() {
+        return chacha20_ctr32_ffi!(
+            unsafe { (SSE_MIN_LEN, Avx2, Overlapping<'_>) => ChaCha20_ctr32_avx2 },
+            self, counter, in_out, cpu);
+    }
+    if let Some(cpu) = <cpu::Features as GetFeature<Ssse3>>::get_feature(&cpu) {
+        if in_out.len() >= 192 || !cpu.perf_is_like_silvermont() {
+            return chacha20_ctr32_ffi!(
+                unsafe {
+                    (SSE_MIN_LEN, Ssse3, Overlapping<'_>) =>
+                        ChaCha20_ctr32_ssse3_4x
+                },
+                self, counter, in_out, cpu)
+        }
+        return chacha20_ctr32_ffi!(
+            unsafe {
+                (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => ChaCha20_ctr32_ssse3
+            },
+            self, counter, in_out, cpu)
+    }
+}
+if in_out.len() >= 1 {
+    chacha20_ctr32_ffi!(
+        unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw },
+        self, counter, in_out, ())
+}
 } else {
 let _: cpu::Features = cpu;
 fallback::ChaCha20_ctr32(self, counter, in_out)
 }
 }
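
The commit message's second point, that the `_nohw` kernels need no `cpu::Features`, shows up above as the `()` argument: the dispatch passes a token type proving the kernel's CPU requirements are met, and `()` is the trivial proof. A minimal sketch of that pattern, with hypothetical stand-in types rather than ring's actual macro internals:

// Hypothetical stand-ins for the token-as-proof pattern used above; ring's
// real `chacha20_ctr32_ffi!` internals differ.
struct Avx2(());

impl Avx2 {
    /// Stand-in for runtime feature detection (here a compile-time stub).
    fn detect() -> Option<Self> {
        if cfg!(target_feature = "avx2") { Some(Avx2(())) } else { None }
    }
}

/// A SIMD kernel demands proof that its CPU features were detected...
fn ctr32_avx2(_proof: Avx2, _data: &mut [u8]) { /* AVX2 kernel here */ }

/// ...while a `_nohw` kernel demands only `()`, which anyone can construct:
/// it requires no CPU features, as the commit message notes.
fn ctr32_nohw(_proof: (), _data: &mut [u8]) { /* portable kernel here */ }

fn main() {
    let mut buf = [0u8; 4];
    match Avx2::detect() {
        Some(token) => ctr32_avx2(token, &mut buf),
        None => ctr32_nohw((), &mut buf),
    }
}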
src/cpu/intel.rs: 41 changes (41 additions & 0 deletions)
@@ -146,11 +146,23 @@ cfg_if! {
 mask: 1 << 22,
 };
 
+// We intentionally avoid defining an `XSave` accessor function. See
+// `Ssse3::cpu_perf_is_like_silvermont`.
+const XSAVE_BUT_NOT_REALLY: Feature = Feature {
+    word: 1,
+    mask: 1 << 26,
+};
+
 pub(crate) const AVX: Feature = Feature {
     word: 1,
     mask: 1 << 28,
 };
 
+const AVX2: Feature = Feature {
+    word: 2,
+    mask: 1 << 5,
+};
+
 const SHA: Feature = Feature {
     word: 2,
     mask: 1 << 29,
@@ -159,7 +171,36 @@ cfg_if! {
 impl_get_feature!{ SSE41 => Sse41 }
 impl_get_feature!{ MOVBE => Movbe }
 impl_get_feature!{ AVX => Avx }
+impl_get_feature!{ AVX2 => Avx2 }
 impl_get_feature!{ SHA => Sha }
 
+impl Ssse3 {
+    /// BoringSSL's counterpart is `CRYPTO_cpu_perf_is_like_silvermont`.
+    ///
+    /// Returns true if, based on a heuristic, the
+    /// CPU has Silvermont-like performance characteristics. It is often faster to
+    /// run different codepaths on these CPUs than the available instructions would
+    /// otherwise select. See chacha-x86_64.pl.
+    ///
+    /// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by
+    /// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it
+    /// isn't matched by this. Various sources indicate AMD first implemented MOVBE
+    /// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be
+    /// matched by this. That seems to be the case for other x86(-64) CPUs.
+    ///
+    /// WARNING: This MUST NOT be used to guard the execution of the XSAVE
+    /// instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit
+    /// that indicates whether we can safely execute XSAVE. This bit may be set
+    /// even when XSAVE is disabled (by the operating system). See how the users of
+    /// this bit use it.
+    ///
+    /// Historically, the XSAVE bit was artificially cleared on Knights Landing
+    /// and Knights Mill chips, but as Intel has removed all support from GCC,
+    /// LLVM, and SDE, we assume they are no longer worth special-casing.
+    pub fn perf_is_like_silvermont(self) -> bool {
+        XSAVE_BUT_NOT_REALLY.available(self.0) && MOVBE.available(self.0)
+    }
+}
 }
 }
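
The WARNING in the new doc comment hinges on a distinction worth making concrete: CPUID leaf 1 ECX bit 26 says the hardware implements XSAVE, while bit 27 (OSXSAVE) says the operating system has enabled it via CR4.OSXSAVE, and only the latter makes executing XSAVE/XGETBV safe. A hedged sketch (constant and function names are illustrative, not ring's API):

// Illustrative constants: CPUID leaf 1, ECX register bits.
const XSAVE_HW: u32 = 1 << 26; // hardware implements XSAVE
const OSXSAVE: u32 = 1 << 27;  // OS has enabled XSAVE (CR4.OSXSAVE)

/// Only OSXSAVE proves XSAVE/XGETBV may be executed without faulting.
fn may_execute_xsave(leaf1_ecx: u32) -> bool {
    leaf1_ecx & OSXSAVE != 0
}

/// The bare hardware bit is usable only as a microarchitecture fingerprint,
/// which is exactly how `perf_is_like_silvermont` uses it above.
fn hardware_has_xsave(leaf1_ecx: u32) -> bool {
    leaf1_ecx & XSAVE_HW != 0
}

fn main() {
    let ecx_os_disabled = XSAVE_HW; // XSAVE present but not OS-enabled
    assert!(hardware_has_xsave(ecx_os_disabled));
    assert!(!may_execute_xsave(ecx_os_disabled));
}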
