chacha: Move x86_64 dispatching from assembly to Rust (Merge BoringSSL f5e0c8f)

Also, the `_nohw` functions do not require `cpu::Features`. Make that clearer
for the AArch64/ARM branches.
briansmith committed Jan 17, 2025
2 parents b72dd0d + f5e0c8f commit 737a3ec
Showing 5 changed files with 117 additions and 68 deletions.
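
The core of the change is visible in src/aead/chacha.rs below: the CPU dispatch that previously lived in ChaCha20_ctr32's assembly prologue is now ordinary Rust. As a rough, self-contained sketch of the selection order — the enum and function here are illustrative stand-ins, not ring's API:

/// Illustrative stand-ins only: which ChaCha20 kernel the new x86_64
/// dispatch in src/aead/chacha.rs selects. Not part of ring's API.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, Debug, PartialEq)]
enum Kernel {
    Avx2,     // ChaCha20_ctr32_avx2
    Ssse3_4x, // ChaCha20_ctr32_ssse3_4x
    Ssse3,    // ChaCha20_ctr32_ssse3
    Nohw,     // ChaCha20_ctr32_nohw: no CPU features required
    None,     // zero-length input: nothing to do
}

fn select_kernel(len: usize, avx2: bool, ssse3: bool, silvermont_like: bool) -> Kernel {
    // Inputs of 128 bytes or fewer always take the portable path.
    const SSE_MIN_LEN: usize = 128 + 1;
    if len >= SSE_MIN_LEN {
        if avx2 {
            return Kernel::Avx2;
        }
        if ssse3 {
            // The old assembly's Atom check, now in Rust: on Silvermont-like
            // CPUs the 4x-wide kernel only pays off above 192 bytes.
            return if len >= 192 || !silvermont_like {
                Kernel::Ssse3_4x
            } else {
                Kernel::Ssse3
            };
        }
    }
    if len >= 1 { Kernel::Nohw } else { Kernel::None }
}

fn main() {
    assert_eq!(select_kernel(64, true, true, false), Kernel::Nohw);
    assert_eq!(select_kernel(256, true, true, false), Kernel::Avx2);
    assert_eq!(select_kernel(150, false, true, true), Kernel::Ssse3);
    assert_eq!(select_kernel(256, false, true, true), Kernel::Ssse3_4x);
}

Note that the `>= 192 || !silvermont_like` test reproduces the old assembly's Atom special case, visible in the removed lines of chacha-x86_64.pl below.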
build.rs: 3 changes (3 additions & 0 deletions)
@@ -855,8 +855,11 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"CRYPTO_poly1305_update",
"CRYPTO_poly1305_update_neon",
"ChaCha20_ctr32",
"ChaCha20_ctr32_avx2",
"ChaCha20_ctr32_neon",
"ChaCha20_ctr32_nohw",
"ChaCha20_ctr32_ssse3",
"ChaCha20_ctr32_ssse3_4x",
"LIMBS_add_mod",
"LIMBS_are_even",
"LIMBS_are_zero",
crypto/chacha/asm/chacha-x86_64.pl: 94 changes (35 additions & 59 deletions)
@@ -76,8 +76,6 @@
 $code.=<<___;
 .text
 
-.extern OPENSSL_ia32cap_P
-
 .section .rodata
 .align 64
 .Lzero:
@@ -226,20 +224,12 @@ sub ROUND { # critical path is 24 cycles per round
 ########################################################################
 # Generic code path that handles all lengths on pre-SSSE3 processors.
 $code.=<<___;
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function,5
+.globl ChaCha20_ctr32_nohw
+.type ChaCha20_ctr32_nohw,\@function,5
 .align 64
-ChaCha20_ctr32:
+ChaCha20_ctr32_nohw:
 .cfi_startproc
 _CET_ENDBR
-cmp \$0,$len
-je .Lno_data
-mov OPENSSL_ia32cap_P+4(%rip),%r10
-___
-$code.=<<___;
-test \$`1<<(41-32)`,%r10d
-jnz .LChaCha20_ssse3
-
 push %rbx
 .cfi_push rbx
 push %rbp
@@ -411,7 +401,7 @@ sub ROUND { # critical path is 24 cycles per round
 .Lno_data:
 ret
 .cfi_endproc
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
+.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
 ___
 
 ########################################################################
@@ -446,19 +436,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
 my $xframe = $win64 ? 32+8 : 8;
 
 $code.=<<___;
-.type ChaCha20_ssse3,\@function,5
+.globl ChaCha20_ctr32_ssse3
+.type ChaCha20_ctr32_ssse3,\@function,5
 .align 32
-ChaCha20_ssse3:
-.LChaCha20_ssse3:
+ChaCha20_ctr32_ssse3:
 .cfi_startproc
 _CET_ENDBR
 mov %rsp,%r9 # frame pointer
 .cfi_def_cfa_register r9
 ___
 $code.=<<___;
-cmp \$128,$len # we might throw away some data,
-ja .LChaCha20_4x # but overall it won't be slower
 
-.Ldo_sse3_after_all:
 sub \$64+$xframe,%rsp
 ___
 $code.=<<___ if ($win64);
@@ -568,7 +555,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
 .Lssse3_epilogue:
 ret
 .cfi_endproc
-.size ChaCha20_ssse3,.-ChaCha20_ssse3
+.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
 ___
 }
 
@@ -706,29 +693,17 @@ sub SSSE3_lane_ROUND {
 my $xframe = $win64 ? 0xa8 : 8;
 
 $code.=<<___;
-.type ChaCha20_4x,\@function,5
+.globl ChaCha20_ctr32_ssse3_4x
+.type ChaCha20_ctr32_ssse3_4x,\@function,5
 .align 32
-ChaCha20_4x:
-.LChaCha20_4x:
+ChaCha20_ctr32_ssse3_4x:
 .cfi_startproc
+_CET_ENDBR
 mov %rsp,%r9 # frame pointer
 .cfi_def_cfa_register r9
-mov %r10,%r11
-___
-$code.=<<___ if ($avx>1);
-shr \$32,%r10 # OPENSSL_ia32cap_P+8
-test \$`1<<5`,%r10 # test AVX2
-jnz .LChaCha20_8x
 ___
 $code.=<<___;
-cmp \$192,$len
-ja .Lproceed4x
-
-and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
-cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
-je .Ldo_sse3_after_all # to detect Atom
 
-.Lproceed4x:
 sub \$0x140+$xframe,%rsp
 ___
 ################ stack layout
@@ -1156,7 +1131,7 @@ sub SSSE3_lane_ROUND {
 .L4x_epilogue:
 ret
 .cfi_endproc
-.size ChaCha20_4x,.-ChaCha20_4x
+.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
 ___
 }
 
@@ -1285,11 +1260,12 @@ sub AVX2_lane_ROUND {
 my $xframe = $win64 ? 0xa8 : 8;
 
 $code.=<<___;
-.type ChaCha20_8x,\@function,5
+.globl ChaCha20_ctr32_avx2
+.type ChaCha20_ctr32_avx2,\@function,5
 .align 32
-ChaCha20_8x:
-.LChaCha20_8x:
+ChaCha20_ctr32_avx2:
 .cfi_startproc
+_CET_ENDBR
 mov %rsp,%r9 # frame register
 .cfi_def_cfa_register r9
 sub \$0x280+$xframe,%rsp
@@ -1801,7 +1777,7 @@ sub AVX2_lane_ROUND {
 .L8x_epilogue:
 ret
 .cfi_endproc
-.size ChaCha20_8x,.-ChaCha20_8x
+.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
 ___
 }
 
@@ -1985,42 +1961,42 @@ sub AVX2_lane_ROUND {

 .section .pdata
 .align 4
-.rva .LSEH_begin_ChaCha20_ctr32
-.rva .LSEH_end_ChaCha20_ctr32
-.rva .LSEH_info_ChaCha20_ctr32
+.rva .LSEH_begin_ChaCha20_ctr32_nohw
+.rva .LSEH_end_ChaCha20_ctr32_nohw
+.rva .LSEH_info_ChaCha20_ctr32_nohw
 
-.rva .LSEH_begin_ChaCha20_ssse3
-.rva .LSEH_end_ChaCha20_ssse3
-.rva .LSEH_info_ChaCha20_ssse3
+.rva .LSEH_begin_ChaCha20_ctr32_ssse3
+.rva .LSEH_end_ChaCha20_ctr32_ssse3
+.rva .LSEH_info_ChaCha20_ctr32_ssse3
 
-.rva .LSEH_begin_ChaCha20_4x
-.rva .LSEH_end_ChaCha20_4x
-.rva .LSEH_info_ChaCha20_4x
+.rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x
+.rva .LSEH_end_ChaCha20_ctr32_ssse3_4x
+.rva .LSEH_info_ChaCha20_ctr32_ssse3_4x
 ___
 $code.=<<___ if ($avx>1);
-.rva .LSEH_begin_ChaCha20_8x
-.rva .LSEH_end_ChaCha20_8x
-.rva .LSEH_info_ChaCha20_8x
+.rva .LSEH_begin_ChaCha20_ctr32_avx2
+.rva .LSEH_end_ChaCha20_ctr32_avx2
+.rva .LSEH_info_ChaCha20_ctr32_avx2
 ___
 $code.=<<___;
 .section .xdata
 .align 8
-.LSEH_info_ChaCha20_ctr32:
+.LSEH_info_ChaCha20_ctr32_nohw:
 .byte 9,0,0,0
 .rva se_handler
 
-.LSEH_info_ChaCha20_ssse3:
+.LSEH_info_ChaCha20_ctr32_ssse3:
 .byte 9,0,0,0
 .rva ssse3_handler
 .rva .Lssse3_body,.Lssse3_epilogue
 
-.LSEH_info_ChaCha20_4x:
+.LSEH_info_ChaCha20_ctr32_ssse3_4x:
 .byte 9,0,0,0
 .rva full_handler
 .rva .L4x_body,.L4x_epilogue
 ___
 $code.=<<___ if ($avx>1);
-.LSEH_info_ChaCha20_8x:
+.LSEH_info_ChaCha20_ctr32_avx2:
 .byte 9,0,0,0
 .rva full_handler
 .rva .L8x_body,.L8x_epilogue # HandlerData[]
crypto/cpu_intel.c: 6 changes (4 additions & 2 deletions)
@@ -150,7 +150,8 @@ void OPENSSL_cpuid_setup(void) {

 // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
 // some Silvermont-specific codepaths which perform better. See OpenSSL
-// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f.
+// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and
+// |CRYPTO_cpu_perf_is_like_silvermont|.
 if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
     (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
   ecx &= ~(1u << 26);
@@ -177,7 +178,8 @@ void OPENSSL_cpuid_setup(void) {
 // Clear AVX2 and AVX512* bits.
 //
 // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream
-// doesn't clear those.
+// doesn't clear those. See the comments in
+// |CRYPTO_hardware_supports_XSAVE|.
 extended_features[0] &=
     ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31));
 }
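As context for the masks in this hunk: CPUID leaf 1 packs the stepping into EAX bits 3:0, model into 7:4, family into 11:8, extended model into 19:16, and extended family into 27:20, so `eax & 0x0fff0ff0` keeps the family/model fields while ignoring stepping and processor type. A small sketch (the helper name is hypothetical):

/// Reduces a CPUID leaf-1 EAX value to the family/model "signature" compared
/// above: keep extended family (bits 27:20), extended model (19:16),
/// family (11:8), and model (7:4); drop processor type (13:12) and
/// stepping (3:0). Helper name is hypothetical.
fn cpuid_signature(eax: u32) -> u32 {
    eax & 0x0fff_0ff0
}

fn main() {
    // Knights Landing: family 0x6, model 0x57 (ext. model 0x5, model 0x7);
    // any stepping yields the signature 0x0005_0670 tested above.
    assert_eq!(cpuid_signature(0x0005_0673), 0x0005_0670);
    // Knights Mill: family 0x6, model 0x85 -> signature 0x0008_0650.
    assert_eq!(cpuid_signature(0x0008_0651), 0x0008_0650);
}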
src/aead/chacha.rs: 41 changes (34 additions & 7 deletions)
@@ -97,8 +97,8 @@ impl Key {
 }
 if in_out.len() >= 1 {
     chacha20_ctr32_ffi!(
-        unsafe { (1, cpu::Features, Overlapping<'_>) => ChaCha20_ctr32_nohw },
-        self, counter, in_out, cpu)
+        unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw },
+        self, counter, in_out, ())
 }
 } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] {
 use cpu::{GetFeature as _, arm::Neon};
@@ -112,18 +112,45 @@ impl Key {
 }
 if in_out.len() >= 1 {
     chacha20_ctr32_ffi!(
-        unsafe { (1, cpu::Features, &mut [u8]) => ChaCha20_ctr32_nohw },
-        self, counter, in_out.copy_within(), cpu)
+        unsafe { (1, (), &mut [u8]) => ChaCha20_ctr32_nohw },
+        self, counter, in_out.copy_within(), ())
 }
 } else if #[cfg(target_arch = "x86")] {
 chacha20_ctr32_ffi!(
     unsafe { (0, cpu::Features, &mut [u8]) => ChaCha20_ctr32 },
     self, counter, in_out.copy_within(), cpu)
 } else if #[cfg(target_arch = "x86_64")] {
-chacha20_ctr32_ffi!(
-    unsafe { (0, cpu::Features, Overlapping<'_>) => ChaCha20_ctr32 },
-    self, counter, in_out, cpu)
+use cpu::{GetFeature, intel::{Avx2, Ssse3}};
+const SSE_MIN_LEN: usize = 128 + 1; // Also AVX2, SSSE3_4X, SSSE3
+if in_out.len() >= SSE_MIN_LEN {
+    if let Some(cpu) = cpu.get_feature() {
+        return chacha20_ctr32_ffi!(
+            unsafe { (SSE_MIN_LEN, Avx2, Overlapping<'_>) => ChaCha20_ctr32_avx2 },
+            self, counter, in_out, cpu);
+    }
+    if let Some(cpu) = <cpu::Features as GetFeature<Ssse3>>::get_feature(&cpu) {
+        if in_out.len() >= 192 || !cpu.perf_is_like_silvermont() {
+            return chacha20_ctr32_ffi!(
+                unsafe {
+                    (SSE_MIN_LEN, Ssse3, Overlapping<'_>) =>
+                        ChaCha20_ctr32_ssse3_4x
+                },
+                self, counter, in_out, cpu)
+        }
+        return chacha20_ctr32_ffi!(
+            unsafe {
+                (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => ChaCha20_ctr32_ssse3
+            },
+            self, counter, in_out, cpu)
+    }
+}
+if in_out.len() >= 1 {
+    chacha20_ctr32_ffi!(
+        unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw },
+        self, counter, in_out, ())
+}
 } else {
 let _: cpu::Features = cpu;
 fallback::ChaCha20_ctr32(self, counter, in_out)
 }
 }
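
The commit message's second point, that the `_nohw` kernels need no `cpu::Features`, shows up above as the `()` argument: the dispatch passes a token type proving the kernel's CPU requirements are met, and `()` is the trivial proof. A minimal sketch of that pattern, with hypothetical stand-in types rather than ring's actual macro internals:

// Hypothetical stand-ins for the token-as-proof pattern used above; ring's
// real `chacha20_ctr32_ffi!` internals differ.
struct Avx2(());

impl Avx2 {
    /// Stand-in for runtime feature detection (here a compile-time stub).
    fn detect() -> Option<Self> {
        if cfg!(target_feature = "avx2") { Some(Avx2(())) } else { None }
    }
}

/// A SIMD kernel demands proof that its CPU features were detected...
fn ctr32_avx2(_proof: Avx2, _data: &mut [u8]) { /* AVX2 kernel here */ }

/// ...while a `_nohw` kernel demands only `()`, which anyone can construct:
/// it requires no CPU features, as the commit message notes.
fn ctr32_nohw(_proof: (), _data: &mut [u8]) { /* portable kernel here */ }

fn main() {
    let mut buf = [0u8; 4];
    match Avx2::detect() {
        Some(token) => ctr32_avx2(token, &mut buf),
        None => ctr32_nohw((), &mut buf),
    }
}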
src/cpu/intel.rs: 41 changes (41 additions & 0 deletions)
@@ -146,11 +146,23 @@ cfg_if! {
 mask: 1 << 22,
 };
 
+// We intentionally avoid defining an `XSave` accessor function. See
+// `Ssse3::cpu_perf_is_like_silvermont`.
+const XSAVE_BUT_NOT_REALLY: Feature = Feature {
+    word: 1,
+    mask: 1 << 26,
+};
+
 pub(crate) const AVX: Feature = Feature {
     word: 1,
     mask: 1 << 28,
 };
 
+const AVX2: Feature = Feature {
+    word: 2,
+    mask: 1 << 5,
+};
+
 const SHA: Feature = Feature {
     word: 2,
     mask: 1 << 29,
@@ -159,7 +171,36 @@ cfg_if! {
 impl_get_feature!{ SSE41 => Sse41 }
 impl_get_feature!{ MOVBE => Movbe }
 impl_get_feature!{ AVX => Avx }
+impl_get_feature!{ AVX2 => Avx2 }
 impl_get_feature!{ SHA => Sha }
 
+impl Ssse3 {
+    /// BoringSSL's counterpart is `CRYPTO_cpu_perf_is_like_silvermont`.
+    ///
+    /// Returns true if, based on a heuristic, the
+    /// CPU has Silvermont-like performance characteristics. It is often faster to
+    /// run different codepaths on these CPUs than the available instructions would
+    /// otherwise select. See chacha-x86_64.pl.
+    ///
+    /// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by
+    /// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it
+    /// isn't matched by this. Various sources indicate AMD first implemented MOVBE
+    /// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be
+    /// matched by this. That seems to be the case for other x86(-64) CPUs.
+    ///
+    /// WARNING: This MUST NOT be used to guard the execution of the XSAVE
+    /// instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit
+    /// that indicates whether we can safely execute XSAVE. This bit may be set
+    /// even when XSAVE is disabled (by the operating system). See how the users of
+    /// this bit use it.
+    ///
+    /// Historically, the XSAVE bit was artificially cleared on Knights Landing
+    /// and Knights Mill chips, but as Intel has removed all support from GCC,
+    /// LLVM, and SDE, we assume they are no longer worth special-casing.
+    pub fn perf_is_like_silvermont(self) -> bool {
+        XSAVE_BUT_NOT_REALLY.available(self.0) && MOVBE.available(self.0)
+    }
+}
 }
 }
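
The WARNING in the new doc comment hinges on a distinction worth making concrete: CPUID leaf 1 ECX bit 26 says the hardware implements XSAVE, while bit 27 (OSXSAVE) says the operating system has enabled it via CR4.OSXSAVE, and only the latter makes executing XSAVE/XGETBV safe. A hedged sketch (constant and function names are illustrative, not ring's API):

// Illustrative constants: CPUID leaf 1, ECX register bits.
const XSAVE_HW: u32 = 1 << 26; // hardware implements XSAVE
const OSXSAVE: u32 = 1 << 27;  // OS has enabled XSAVE (CR4.OSXSAVE)

/// Only OSXSAVE proves XSAVE/XGETBV may be executed without faulting.
fn may_execute_xsave(leaf1_ecx: u32) -> bool {
    leaf1_ecx & OSXSAVE != 0
}

/// The bare hardware bit is usable only as a microarchitecture fingerprint,
/// which is exactly how `perf_is_like_silvermont` uses it above.
fn hardware_has_xsave(leaf1_ecx: u32) -> bool {
    leaf1_ecx & XSAVE_HW != 0
}

fn main() {
    let ecx_os_disabled = XSAVE_HW; // XSAVE present but not OS-enabled
    assert!(hardware_has_xsave(ecx_os_disabled));
    assert!(!may_execute_xsave(ecx_os_disabled));
}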
