From ff01186ade2267937e1f7a818aeb876e9453af48 Mon Sep 17 00:00:00 2001 From: "Robert B. Langer" Date: Tue, 12 Aug 2025 20:29:16 -0400 Subject: [PATCH] groestl: add AVX-512/GFNI backend --- Cargo.lock | 2 + groestl/Cargo.toml | 8 + groestl/src/block_api.rs | 8 +- groestl/src/compress_long.rs | 36 ++-- groestl/src/compress_short.rs | 98 +++------ groestl/src/compress_short/avx512_gfni.rs | 235 ++++++++++++++++++++++ groestl/src/compress_short/soft.rs | 84 ++++++++ groestl/src/lib.rs | 2 +- 8 files changed, 377 insertions(+), 96 deletions(-) create mode 100644 groestl/src/compress_short/avx512_gfni.rs create mode 100644 groestl/src/compress_short/soft.rs diff --git a/Cargo.lock b/Cargo.lock index 7c18d3b42..2847702ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -134,6 +134,8 @@ name = "groestl" version = "0.11.0-pre" dependencies = [ "base16ct", + "cfg-if", + "cpufeatures", "digest", "hex-literal", ] diff --git a/groestl/Cargo.toml b/groestl/Cargo.toml index 688ab0617..243ae4ad1 100644 --- a/groestl/Cargo.toml +++ b/groestl/Cargo.toml @@ -14,6 +14,10 @@ categories = ["cryptography", "no-std"] [dependencies] digest = "0.11.0-rc.0" +cfg-if = "1" + +[target.'cfg(any(target_arch = "x86_64", target_arch = "x86"))'.dependencies] +cpufeatures = "0.2.12" [dev-dependencies] digest = { version = "0.11.0-rc.0", features = ["dev"] } @@ -25,5 +29,9 @@ default = ["alloc"] alloc = ["digest/alloc"] zeroize = ["digest/zeroize"] +[lints.rust.unexpected_cfgs] +level = "warn" +check-cfg = ["cfg(groestl_force_soft)"] + [package.metadata.docs.rs] all-features = true diff --git a/groestl/src/block_api.rs b/groestl/src/block_api.rs index 5bb9cfc73..84c41adc3 100644 --- a/groestl/src/block_api.rs +++ b/groestl/src/block_api.rs @@ -1,4 +1,4 @@ -use core::fmt; +use core::{fmt, slice}; use digest::{ HashMarker, InvalidOutputSize, Output, block_api::{ @@ -36,9 +36,7 @@ macro_rules! impl_variant { #[inline] fn update_blocks(&mut self, blocks: &[Block]) { self.blocks_len += blocks.len() as u64; - for block in blocks { - $compress::compress(&mut self.state, block.as_ref()); - } + $compress::compress(&mut self.state, Block::::cast_slice_to_core(blocks)); } } @@ -72,7 +70,7 @@ macro_rules! impl_variant { self.blocks_len + 1 }; buffer.len64_padding_be(blocks_len, |block| { - $compress::compress(&mut self.state, block.as_ref()) + $compress::compress(&mut self.state, slice::from_ref(block.as_ref())) }); let res = $compress::p(&self.state); let n = $compress::COLS / 2; diff --git a/groestl/src/compress_long.rs b/groestl/src/compress_long.rs index 56010d1ea..a2fd4c642 100644 --- a/groestl/src/compress_long.rs +++ b/groestl/src/compress_long.rs @@ -65,23 +65,25 @@ fn rndp(mut x: [u64; COLS], r: u64) -> [u64; COLS] { ] } -pub(crate) fn compress(h: &mut [u64; COLS], block: &[u8; 128]) { - let mut q = [0u64; COLS]; - for (chunk, v) in block.chunks_exact(8).zip(q.iter_mut()) { - *v = u64::from_be_bytes(chunk.try_into().unwrap()); - } - let mut p = [0u64; COLS]; - for i in 0..COLS { - p[i] = h[i] ^ q[i]; - } - for i in 0..ROUNDS { - q = rndq(q, i); - } - for i in 0..ROUNDS { - p = rndp(p, i << 56); - } - for i in 0..COLS { - h[i] ^= q[i] ^ p[i]; +pub(crate) fn compress(h: &mut [u64; COLS], blocks: &[[u8; 128]]) { + for block in blocks { + let mut q = [0u64; COLS]; + for (chunk, v) in block.chunks_exact(8).zip(q.iter_mut()) { + *v = u64::from_be_bytes(chunk.try_into().unwrap()); + } + let mut p = [0u64; COLS]; + for i in 0..COLS { + p[i] = h[i] ^ q[i]; + } + for i in 0..ROUNDS { + q = rndq(q, i); + } + for i in 0..ROUNDS { + p = rndp(p, i << 56); + } + for i in 0..COLS { + h[i] ^= q[i] ^ p[i]; + } } } diff --git a/groestl/src/compress_short.rs b/groestl/src/compress_short.rs index 4257b4f1f..e337d3411 100644 --- a/groestl/src/compress_short.rs +++ b/groestl/src/compress_short.rs @@ -1,81 +1,33 @@ -#![allow(clippy::needless_range_loop)] -use crate::table::TABLE; - pub(crate) const COLS: usize = 8; -const ROUNDS: u64 = 10; -#[inline(always)] -fn column(x: &[u64; COLS], c: [usize; 8]) -> u64 { - let mut t = 0; - for i in 0..8 { - let sl = 8 * (7 - i); - let idx = ((x[c[i]] >> sl) & 0xFF) as usize; - t ^= TABLE[i][idx]; - } - t -} +mod soft; -#[inline(always)] -fn rndq(mut x: [u64; COLS], r: u64) -> [u64; COLS] { - for i in 0..COLS { - x[i] ^= u64::MAX.wrapping_sub((i as u64) << 4) ^ r; - } - [ - column(&x, [1, 3, 5, 7, 0, 2, 4, 6]), - column(&x, [2, 4, 6, 0, 1, 3, 5, 7]), - column(&x, [3, 5, 7, 1, 2, 4, 6, 0]), - column(&x, [4, 6, 0, 2, 3, 5, 7, 1]), - column(&x, [5, 7, 1, 3, 4, 6, 0, 2]), - column(&x, [6, 0, 2, 4, 5, 7, 1, 3]), - column(&x, [7, 1, 3, 5, 6, 0, 2, 4]), - column(&x, [0, 2, 4, 6, 7, 1, 3, 5]), - ] -} +cfg_if::cfg_if! { + if #[cfg(any(not(any(target_arch = "x86_64", target_arch = "x86")), groestl_force_soft))] { + pub(crate) use soft::*; + } else { + mod avx512_gfni; -#[inline(always)] -fn rndp(mut x: [u64; COLS], r: u64) -> [u64; COLS] { - for i in 0..COLS { - x[i] ^= ((i as u64) << 60) ^ r; - } - [ - column(&x, [0, 1, 2, 3, 4, 5, 6, 7]), - column(&x, [1, 2, 3, 4, 5, 6, 7, 0]), - column(&x, [2, 3, 4, 5, 6, 7, 0, 1]), - column(&x, [3, 4, 5, 6, 7, 0, 1, 2]), - column(&x, [4, 5, 6, 7, 0, 1, 2, 3]), - column(&x, [5, 6, 7, 0, 1, 2, 3, 4]), - column(&x, [6, 7, 0, 1, 2, 3, 4, 5]), - column(&x, [7, 0, 1, 2, 3, 4, 5, 6]), - ] -} + cpufeatures::new!(cpuid_avx512_gfni, "avx", "avx512f", "avx512vbmi", "gfni"); -pub(crate) fn compress(h: &mut [u64; COLS], block: &[u8; 64]) { - let mut q = [0u64; COLS]; - for (chunk, v) in block.chunks_exact(8).zip(q.iter_mut()) { - *v = u64::from_be_bytes(chunk.try_into().unwrap()); - } - let mut p = [0u64; COLS]; - for i in 0..COLS { - p[i] = h[i] ^ q[i]; - } - for i in 0..ROUNDS { - q = rndq(q, i); - } - for i in 0..ROUNDS { - p = rndp(p, i << 56); - } - for i in 0..COLS { - h[i] ^= q[i] ^ p[i]; - } -} + #[inline(always)] + pub(crate) fn compress(h: &mut [u64; COLS], blocks: &[[u8; 64]]) { + if cpuid_avx512_gfni::get() { + #[allow(unsafe_code)] + unsafe { avx512_gfni::compress(h, blocks); } + } else { + soft::compress(h, blocks); + } + } -pub(crate) fn p(h: &[u64; COLS]) -> [u64; COLS] { - let mut p = *h; - for i in 0..ROUNDS { - p = rndp(p, i << 56); - } - for i in 0..COLS { - p[i] ^= h[i]; + #[inline(always)] + pub(crate) fn p(h: &[u64; COLS]) -> [u64; COLS] { + if cpuid_avx512_gfni::get() { + #[allow(unsafe_code)] + unsafe { avx512_gfni::p(h) } + } else { + soft::p(h) + } + } } - p } diff --git a/groestl/src/compress_short/avx512_gfni.rs b/groestl/src/compress_short/avx512_gfni.rs new file mode 100644 index 000000000..c218d3922 --- /dev/null +++ b/groestl/src/compress_short/avx512_gfni.rs @@ -0,0 +1,235 @@ +#![allow(unsafe_code)] + +use core::mem::MaybeUninit; + +#[cfg(target_arch = "x86")] +use core::arch::x86; + +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64 as x86; + +use x86::{ + __m512i, _mm512_gf2p8affineinv_epi64_epi8, _mm512_gf2p8mul_epi8, _mm512_loadu_si512, + _mm512_permutexvar_epi8, _mm512_permutexvar_epi64, _mm512_rorv_epi64, _mm512_set_epi64, + _mm512_set1_epi64, _mm512_setr_epi64, _mm512_storeu_si512, _mm512_xor_si512, +}; + +#[inline(always)] +unsafe fn mix_bytes(x: __m512i) -> __m512i { + unsafe { + const ROW2: i64 = 0x0202020202020202; + const ROW3: i64 = 0x0303030303030303; + const ROW4: i64 = 0x0404040404040404; + const ROW5: i64 = 0x0505050505050505; + const ROW7: i64 = 0x0707070707070707; + + let wide0 = _mm512_permutexvar_epi64(_mm512_set1_epi64(0), x); + let wide1 = _mm512_permutexvar_epi64(_mm512_set1_epi64(1), x); + let wide2 = _mm512_permutexvar_epi64(_mm512_set1_epi64(2), x); + let wide3 = _mm512_permutexvar_epi64(_mm512_set1_epi64(3), x); + let wide4 = _mm512_permutexvar_epi64(_mm512_set1_epi64(4), x); + let wide5 = _mm512_permutexvar_epi64(_mm512_set1_epi64(5), x); + let wide6 = _mm512_permutexvar_epi64(_mm512_set1_epi64(6), x); + let wide7 = _mm512_permutexvar_epi64(_mm512_set1_epi64(7), x); + + _mm512_xor_si512( + _mm512_xor_si512( + _mm512_xor_si512( + _mm512_gf2p8mul_epi8( + wide0, + _mm512_setr_epi64(ROW2, ROW7, ROW5, ROW3, ROW5, ROW4, ROW3, ROW2), + ), + _mm512_gf2p8mul_epi8( + wide1, + _mm512_setr_epi64(ROW2, ROW2, ROW7, ROW5, ROW3, ROW5, ROW4, ROW3), + ), + ), + _mm512_xor_si512( + _mm512_gf2p8mul_epi8( + wide2, + _mm512_setr_epi64(ROW3, ROW2, ROW2, ROW7, ROW5, ROW3, ROW5, ROW4), + ), + _mm512_gf2p8mul_epi8( + wide3, + _mm512_setr_epi64(ROW4, ROW3, ROW2, ROW2, ROW7, ROW5, ROW3, ROW5), + ), + ), + ), + _mm512_xor_si512( + _mm512_xor_si512( + _mm512_gf2p8mul_epi8( + wide4, + _mm512_setr_epi64(ROW5, ROW4, ROW3, ROW2, ROW2, ROW7, ROW5, ROW3), + ), + _mm512_gf2p8mul_epi8( + wide5, + _mm512_setr_epi64(ROW3, ROW5, ROW4, ROW3, ROW2, ROW2, ROW7, ROW5), + ), + ), + _mm512_xor_si512( + _mm512_gf2p8mul_epi8( + wide6, + _mm512_setr_epi64(ROW5, ROW3, ROW5, ROW4, ROW3, ROW2, ROW2, ROW7), + ), + _mm512_gf2p8mul_epi8( + wide7, + _mm512_setr_epi64(ROW7, ROW5, ROW3, ROW5, ROW4, ROW3, ROW2, ROW2), + ), + ), + ), + ) + } +} + +#[inline(always)] +unsafe fn round256(x: __m512i) -> __m512i { + unsafe { + // AddRoundConstant + let rc = 0x7060504030201000u64 | (0x0101010101010101u64 * (R as u64)); + let x = _mm512_xor_si512( + x, + if P { + _mm512_setr_epi64(rc as i64, 0, 0, 0, 0, 0, 0, 0) + } else { + _mm512_set_epi64(!rc as i64, -1, -1, -1, -1, -1, -1, -1) + }, + ); + + // SubBytes (Rijndael S-box) + let x = _mm512_gf2p8affineinv_epi64_epi8( + x, + _mm512_set1_epi64(0xF1E3C78F1F3E7CF8u64 as i64), + 0b01100011, + ); + + // ShiftBytes + let x = _mm512_rorv_epi64( + x, + if P { + _mm512_setr_epi64(0, 8, 16, 24, 32, 40, 48, 56) + } else { + _mm512_setr_epi64(8, 24, 40, 56, 0, 16, 32, 48) + }, + ); + + // MixBytes + mix_bytes(x) + } +} + +#[inline(always)] +unsafe fn transpose(x: __m512i) -> __m512i { + unsafe { + _mm512_permutexvar_epi8( + _mm512_setr_epi64( + 0x3830282018100800, + 0x3931292119110901, + 0x3A322A221A120A02, + 0x3B332B231B130B03, + 0x3C342C241C140C04, + 0x3D352D251D150D05, + 0x3E362E261E160E06, + 0x3F372F271F170F07, + ), + x, + ) + } +} + +#[inline(always)] +unsafe fn transpose_byteswap(x: __m512i) -> __m512i { + unsafe { + // Equivalent to converting 64-bit integers to big-endian, then transposing + _mm512_permutexvar_epi8( + _mm512_setr_epi64( + 0x3f372f271f170f07, + 0x3e362e261e160e06, + 0x3d352d251d150d05, + 0x3c342c241c140c04, + 0x3b332b231b130b03, + 0x3a322a221a120a02, + 0x3931292119110901, + 0x3830282018100800, + ), + x, + ) + } +} + +#[inline(always)] +unsafe fn byteswap_transpose(x: __m512i) -> __m512i { + unsafe { + // Equivalent to transposing, then converting 64-bit integers to big-endian + _mm512_permutexvar_epi8( + _mm512_setr_epi64( + 0x0008101820283038, + 0x0109111921293139, + 0x020a121a222a323a, + 0x030b131b232b333b, + 0x040c141c242c343c, + 0x050d151d252d353d, + 0x060e161e262e363e, + 0x070f171f272f373f, + ), + x, + ) + } +} + +#[inline(always)] +unsafe fn permute256(x: __m512i) -> __m512i { + unsafe { + let x = round256::<0, P>(x); + let x = round256::<1, P>(x); + let x = round256::<2, P>(x); + let x = round256::<3, P>(x); + let x = round256::<4, P>(x); + let x = round256::<5, P>(x); + let x = round256::<6, P>(x); + let x = round256::<7, P>(x); + let x = round256::<8, P>(x); + round256::<9, P>(x) + } +} + +#[inline(always)] +unsafe fn compress256(h: __m512i, m: &[u8; 64]) -> __m512i { + unsafe { + let m = transpose(_mm512_loadu_si512(m.as_ptr() as *const __m512i)); + _mm512_xor_si512( + _mm512_xor_si512( + permute256::(_mm512_xor_si512(h, m)), + permute256::(m), + ), + h, + ) + } +} + +#[target_feature(enable = "avx,avx512f,gfni,avx512vbmi")] +pub(crate) unsafe fn compress(h: &mut [u64; super::COLS], blocks: &[[u8; 64]]) { + unsafe { + debug_assert_eq!(super::COLS * 64, 512); + + let mut h_ = transpose_byteswap(_mm512_loadu_si512(h.as_ptr() as *const __m512i)); + for m in blocks { + h_ = compress256(h_, m); + } + let h_ = byteswap_transpose(h_); + _mm512_storeu_si512(h.as_mut_ptr() as *mut __m512i, h_); + } +} + +#[target_feature(enable = "avx,avx512f,gfni,avx512vbmi")] +pub(crate) unsafe fn p(h: &[u64; super::COLS]) -> [u64; super::COLS] { + unsafe { + debug_assert_eq!(super::COLS * 64, 512); + + let h_ = transpose_byteswap(_mm512_loadu_si512(h.as_ptr() as *const __m512i)); + let p = permute256::(h_); + let h_ = byteswap_transpose(_mm512_xor_si512(h_, p)); + let mut h = MaybeUninit::uninit(); + _mm512_storeu_si512(h.as_mut_ptr() as *mut __m512i, h_); + h.assume_init() + } +} diff --git a/groestl/src/compress_short/soft.rs b/groestl/src/compress_short/soft.rs new file mode 100644 index 000000000..8702f6c8b --- /dev/null +++ b/groestl/src/compress_short/soft.rs @@ -0,0 +1,84 @@ +#![allow(clippy::needless_range_loop)] + +use super::COLS; +use crate::table::TABLE; + +const ROUNDS: u64 = 10; + +#[inline(always)] +fn column(x: &[u64; COLS], c: [usize; 8]) -> u64 { + let mut t = 0; + for i in 0..8 { + let sl = 8 * (7 - i); + let idx = ((x[c[i]] >> sl) & 0xFF) as usize; + t ^= TABLE[i][idx]; + } + t +} + +#[inline(always)] +fn rndq(mut x: [u64; COLS], r: u64) -> [u64; COLS] { + for i in 0..COLS { + x[i] ^= u64::MAX.wrapping_sub((i as u64) << 4) ^ r; + } + [ + column(&x, [1, 3, 5, 7, 0, 2, 4, 6]), + column(&x, [2, 4, 6, 0, 1, 3, 5, 7]), + column(&x, [3, 5, 7, 1, 2, 4, 6, 0]), + column(&x, [4, 6, 0, 2, 3, 5, 7, 1]), + column(&x, [5, 7, 1, 3, 4, 6, 0, 2]), + column(&x, [6, 0, 2, 4, 5, 7, 1, 3]), + column(&x, [7, 1, 3, 5, 6, 0, 2, 4]), + column(&x, [0, 2, 4, 6, 7, 1, 3, 5]), + ] +} + +#[inline(always)] +fn rndp(mut x: [u64; COLS], r: u64) -> [u64; COLS] { + for i in 0..COLS { + x[i] ^= ((i as u64) << 60) ^ r; + } + [ + column(&x, [0, 1, 2, 3, 4, 5, 6, 7]), + column(&x, [1, 2, 3, 4, 5, 6, 7, 0]), + column(&x, [2, 3, 4, 5, 6, 7, 0, 1]), + column(&x, [3, 4, 5, 6, 7, 0, 1, 2]), + column(&x, [4, 5, 6, 7, 0, 1, 2, 3]), + column(&x, [5, 6, 7, 0, 1, 2, 3, 4]), + column(&x, [6, 7, 0, 1, 2, 3, 4, 5]), + column(&x, [7, 0, 1, 2, 3, 4, 5, 6]), + ] +} + +pub(crate) fn compress(h: &mut [u64; COLS], blocks: &[[u8; 64]]) { + for block in blocks { + let mut q = [0u64; COLS]; + for (chunk, v) in block.chunks_exact(8).zip(q.iter_mut()) { + *v = u64::from_be_bytes(chunk.try_into().unwrap()); + } + let mut p = [0u64; COLS]; + for i in 0..COLS { + p[i] = h[i] ^ q[i]; + } + for i in 0..ROUNDS { + q = rndq(q, i); + } + for i in 0..ROUNDS { + p = rndp(p, i << 56); + } + for i in 0..COLS { + h[i] ^= q[i] ^ p[i]; + } + } +} + +pub(crate) fn p(h: &[u64; COLS]) -> [u64; COLS] { + let mut p = *h; + for i in 0..ROUNDS { + p = rndp(p, i << 56); + } + for i in 0..COLS { + p[i] ^= h[i]; + } + p +} diff --git a/groestl/src/lib.rs b/groestl/src/lib.rs index 0f8ac73f9..357123c8c 100644 --- a/groestl/src/lib.rs +++ b/groestl/src/lib.rs @@ -5,7 +5,7 @@ html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/media/6ee8e381/logo.svg" )] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -#![forbid(unsafe_code)] +#![deny(unsafe_code)] #![warn(missing_docs)] pub use digest::{self, Digest};