diff --git a/.gitignore b/.gitignore index 293a17bb6..695fe9b7b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ pgo-data.profdata # MacOS nuisances .DS_Store + +*.log + diff --git a/Cargo.toml b/Cargo.toml index 81dbbde49..3bb243ecf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,10 +9,12 @@ hashbrown = { version = "0.14.3", default-features = false, features = ["ahash", itertools = { version = "0.11.0", default-features = false } log = { version = "0.4.14", default-features = false } num = { version = "0.4", default-features = false, features = ["rand"] } +once_cell = { version = "1.18.0", default-features = false } rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } +zeknox = { path = "../zeknox/wrappers/rust" } [profile.release] opt-level = 3 diff --git a/field/Cargo.toml b/field/Cargo.toml index e13f49efd..39ee8ef07 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] } static_assertions = { workspace = true } unroll = { workspace = true } +# cuda accelerator wrapper +zeknox = { workspace = true } + # Local dependencies plonky2_util = { version = "1.0.0", path = "../util", default-features = false } @@ -29,3 +32,12 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] [lints] workspace = true + + +[features] +# default = [] +default = [ "cuda" ] +# default = [ "cuda", "cuda_sanity_check" ] +cuda = [] +# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared +cuda_sanity_check = ["cuda"] \ No newline at end of file diff --git a/field/src/fft.rs b/field/src/fft.rs index d078ca6c3..6f1d5355d 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -11,6 +11,36 @@ use crate::types::Field; pub type FftRootTable = Vec>; +pub fn batch_fft(input: &[PolynomialCoeffs]) -> Vec> { + #[cfg(feature = "cuda")] + { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + let mut data = input + .iter() + .flat_map(|poly| poly.coeffs.clone()) + .collect::>(); + let mut cfg = NTTConfig::default(); + cfg.batches = input.len() as u32; + let poly_len = input[0].len(); + ntt_batch(0, &mut data, log2_strict(poly_len), cfg); + + data.chunks(poly_len) + .map(|chunk| PolynomialValues::new(chunk.to_vec())) + .collect() + } + #[cfg(not(feature = "cuda"))] + { + let mut res = Vec::with_capacity(input.len()); + for poly in input.iter() { + let pv = fft_with_options(poly.clone(), None, None); + res.push(pv); + } + res + } +} + pub fn fft_root_table(n: usize) -> FftRootTable { let lg_n = log2_strict(n); // bases[i] = g^2^i, for i = 0, ..., lg_n - 1 @@ -32,16 +62,72 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } +#[cfg(feature = "cuda")] +fn fft_dispatch_gpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + if F::CUDA_SUPPORT { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + #[cfg(feature = "cuda_sanity_check")] + let cpu_res = { + let mut input_clone = input.to_vec(); + fft_dispatch_cpu(&mut input_clone, zero_factor, root_table); + input_clone + }; + + ntt_batch( + 0, + input, + input.len().trailing_zeros() as usize, + NTTConfig::default(), + ); + + #[cfg(feature = "cuda_sanity_check")] + for (i, (a, b)) in input.iter().zip(cpu_res.iter()).enumerate() { + if a != b { + panic!( + "Mismatch at index {}: gpu result = {}, cpu result = {}", + i, a, b + ); + } + } + return; + } else { + return fft_dispatch_cpu(input, zero_factor, root_table); + } +} + +fn fft_dispatch_cpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + if root_table.is_some() { + fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()) + } else { + let computed = fft_root_table::(input.len()); + fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()) + }; +} + #[inline] fn fft_dispatch( input: &mut [F], zero_factor: Option, root_table: Option<&FftRootTable>, ) { - let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len())); - let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap(); - - fft_classic(input, zero_factor.unwrap_or(0), used_root_table); + #[cfg(feature = "cuda")] + { + fft_dispatch_gpu(input, zero_factor, root_table) + } + #[cfg(not(feature = "cuda"))] + { + fft_dispatch_cpu(input, zero_factor, root_table) + } } #[inline] @@ -50,6 +136,7 @@ pub fn fft(poly: PolynomialCoeffs) -> PolynomialValues { } #[inline] + pub fn fft_with_options( poly: PolynomialCoeffs, zero_factor: Option, @@ -65,6 +152,28 @@ pub fn ifft(poly: PolynomialValues) -> PolynomialCoeffs { ifft_with_options(poly, None, None) } +#[inline] +pub fn ifft_cpu(poly: PolynomialValues) -> PolynomialCoeffs { + let n = poly.len(); + let lg_n = log2_strict(n); + let n_inv = F::inverse_2exp(lg_n); + + let PolynomialValues { values: mut buffer } = poly; + fft_dispatch_cpu(&mut buffer, None, None); + + // We reverse all values except the first, and divide each by n. + buffer[0] *= n_inv; + buffer[n / 2] *= n_inv; + for i in 1..(n / 2) { + let j = n - i; + let coeffs_i = buffer[j] * n_inv; + let coeffs_j = buffer[i] * n_inv; + buffer[i] = coeffs_i; + buffer[j] = coeffs_j; + } + PolynomialCoeffs { coeffs: buffer } +} + pub fn ifft_with_options( poly: PolynomialValues, zero_factor: Option, @@ -214,6 +323,17 @@ mod tests { #[test] fn fft_and_ifft() { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for sizes we'll use + // degree_padded is 256 = 2^8 + // lde then add 4 more bits + for i in 8..=12 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + type F = GoldilocksField; let degree = 200usize; let degree_padded = degree.next_power_of_two(); @@ -222,7 +342,7 @@ mod tests { // "random", the last degree_padded-degree of them are zero. let coeffs = (0..degree) .map(|i| F::from_canonical_usize(i * 1337 % 100)) - .chain(core::iter::repeat_n(F::ZERO, degree_padded - degree)) + .chain(core::iter::repeat(F::ZERO).take(degree_padded - degree)) .collect::>(); assert_eq!(coeffs.len(), degree_padded); let coefficients = PolynomialCoeffs { coeffs }; diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs index 8f2d85253..6dd15ce0d 100644 --- a/field/src/goldilocks_extensions.rs +++ b/field/src/goldilocks_extensions.rs @@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/2) const DTH_ROOT: Self = Self(18446744069414584320); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = + [Self(18081566051660590251), Self(16121475356294670766)]; - const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)]; + const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; } impl Mul for QuadraticExtension { @@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/4) const DTH_ROOT: Self = Self(281474976710656); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = - [Self(0), Self(8295451483910296135), Self(0), Self(0)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ + Self(5024755240244648895), + Self(13227474371289740625), + Self(3912887029498544536), + Self(3900057112666848848), + ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = - [Self(0), Self(0), Self(0), Self(17216955519093520442)]; + [Self(0), Self(0), Self(0), Self(12587610116473453104)]; } impl Mul for QuarticExtension { @@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField { const DTH_ROOT: Self = Self(1041288259238279555); const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ - Self(4624713872807171977), - Self(381988216716071028), - Self(14499722700050429911), - Self(4870631734967222356), - Self(4518902370426242880), + Self(2899034827742553394), + Self(13012057356839176729), + Self(14593811582388663055), + Self(7722900811313895436), + Self(4557222484695340057), ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index a4da1fee0..449b62abd 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -77,17 +77,19 @@ impl Field for GoldilocksField { const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY; // Sage: `g = GF(p).multiplicative_generator()` - const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720); + const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7); // Sage: // ``` // g_2 = g^((p - 1) / 2^32) // g_2.multiplicative_order().factor() // ``` - const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926); + const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772); const BITS: usize = 64; + const CUDA_SUPPORT: bool = true; + fn order() -> BigUint { Self::ORDER.into() } diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs index df7084572..9772fff56 100644 --- a/field/src/interpolation.rs +++ b/field/src/interpolation.rs @@ -77,6 +77,9 @@ pub fn interpolate2(points: [(F, F); 2], x: F) -> F { #[cfg(test)] mod tests { + #[cfg(feature = "cuda")] + use zeknox::init_twiddle_factors_rs; + use super::*; use crate::extension::quartic::QuarticExtension; use crate::goldilocks_field::GoldilocksField; @@ -87,7 +90,12 @@ mod tests { fn interpolant_random() { type F = GoldilocksField; - for deg in 0..10 { + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + + for deg in 2..10 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log2_ceil(deg)); let domain = F::rand_vec(deg); let coeffs = F::rand_vec(deg); let coeffs = PolynomialCoeffs { coeffs }; @@ -101,7 +109,13 @@ mod tests { fn interpolant_random_roots_of_unity() { type F = GoldilocksField; - for deg_log in 0..4 { + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + + for deg_log in 1..4 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, deg_log); + let deg = 1 << deg_log; let domain = F::two_adic_subgroup(deg_log); let coeffs = F::rand_vec(deg); @@ -116,8 +130,15 @@ mod tests { fn interpolant_random_overspecified() { type F = GoldilocksField; + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + for deg in 0..10 { let points = deg + 5; + + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log2_ceil(points)); + let domain = F::rand_vec(points); let coeffs = F::rand_vec(deg); let coeffs = PolynomialCoeffs { coeffs }; @@ -137,6 +158,8 @@ mod tests { let points = [(F::rand(), F::rand()), (F::rand(), F::rand())]; let x = F::rand(); + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, 2); let ev0 = interpolant(&points).eval(x); let ev1 = interpolate(&points, x, &barycentric_weights(&points)); let ev2 = interpolate2(points, x); diff --git a/field/src/lib.rs b/field/src/lib.rs index 9a2ea4f9c..be1c4b512 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -24,6 +24,7 @@ pub mod polynomial; pub mod secp256k1_base; pub mod secp256k1_scalar; pub mod types; +pub mod util; pub mod zero_poly_coset; #[cfg(test)] diff --git a/field/src/polynomial/division.rs b/field/src/polynomial/division.rs index 7d85d5492..a34602de4 100644 --- a/field/src/polynomial/division.rs +++ b/field/src/polynomial/division.rs @@ -78,7 +78,7 @@ impl PolynomialCoeffs { .iter() .rev() .scan(F::ZERO, |acc, &c| { - *acc = *acc * z + c; + *acc = c.multiply_accumulate(*acc, z); Some(*acc) }) .collect::>(); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index c13bbca27..ffe5d18c6 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -12,7 +12,7 @@ use plonky2_util::log2_strict; use serde::{Deserialize, Serialize}; use crate::extension::{Extendable, FieldExtension}; -use crate::fft::{fft, fft_with_options, ifft, FftRootTable}; +use crate::fft::{fft, fft_with_options, ifft, ifft_cpu, FftRootTable}; use crate::types::Field; /// A polynomial in point-value form. @@ -55,10 +55,17 @@ impl PolynomialValues { self.values.len() } + /// Adaptive IFFT: uses GPU if available, otherwise CPU. pub fn ifft(self) -> PolynomialCoeffs { ifft(self) } + /// Enfored to use CPU IFFT. + /// Used for bypass the GPU issue during setup phase. + pub fn ifft_cpu(self) -> PolynomialCoeffs { + ifft_cpu(self) + } + /// Returns the polynomial whose evaluation on the coset `shift*H` is `self`. pub fn coset_ifft(self, shift: F) -> PolynomialCoeffs { let mut shifted_coeffs = self.ifft(); @@ -438,7 +445,6 @@ impl Mul for &PolynomialCoeffs { #[cfg(test)] mod tests { - use std::time::Instant; use rand::rngs::OsRng; use rand::Rng; @@ -447,6 +453,17 @@ mod tests { use crate::goldilocks_field::GoldilocksField; use crate::types::Sample; + #[cfg(feature = "cuda")] + fn init_gpu_for_tests() { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for various sizes + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + #[test] fn test_trimmed() { type F = GoldilocksField; @@ -475,6 +492,9 @@ mod tests { #[test] fn test_coset_fft() { + #[cfg(feature = "cuda")] + init_gpu_for_tests(); + type F = GoldilocksField; let k = 8; @@ -496,6 +516,9 @@ mod tests { #[test] fn test_coset_ifft() { + #[cfg(feature = "cuda")] + init_gpu_for_tests(); + type F = GoldilocksField; let k = 8; @@ -601,7 +624,10 @@ mod tests { // Test to see which polynomial division method is faster for divisions of the type // `(X^n - 1)/(X - a) #[test] + #[cfg(not(feature = "cuda"))] fn test_division_linear() { + use std::time::Instant; + type F = GoldilocksField; let mut rng = OsRng; let l = 14; diff --git a/field/src/types.rs b/field/src/types.rs index 153231c9a..0c6947907 100644 --- a/field/src/types.rs +++ b/field/src/types.rs @@ -91,6 +91,9 @@ pub trait Field: /// The bit length of the field order. const BITS: usize; + /// Whether this field is supported by cuda + const CUDA_SUPPORT: bool = false; + fn order() -> BigUint; fn characteristic() -> BigUint; diff --git a/field/src/util.rs b/field/src/util.rs new file mode 100644 index 000000000..374a38986 --- /dev/null +++ b/field/src/util.rs @@ -0,0 +1,20 @@ +use alloc::vec::Vec; +use core::mem; + +// alloc memory for Vec, where every element is 0. (a lot) faster than vec![F::ZERO; len] +pub unsafe fn vec_zeroed(len: usize) -> Vec { + let elem_size = mem::size_of::(); + debug_assert!(elem_size != 0, "ZST not supported by this helper"); + + // Layout for len elements + let layout = alloc::alloc::Layout::array::(len).expect("layout overflow"); + + // Allocate zeroed memory + let ptr = alloc::alloc::alloc_zeroed(layout) as *mut F; + if ptr.is_null() { + alloc::alloc::handle_alloc_error(layout); + } + + // Take ownership as a Vec + Vec::from_raw_parts(ptr, len, len) +} diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 36f26fb79..fafd57d68 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -11,12 +11,22 @@ repository.workspace = true keywords.workspace = true categories.workspace = true + [features] -default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] +# default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ] +# +default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ] +# default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] timing = ["std", "dep:web-time"] +cuda = ["plonky2_field/cuda"] +# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared +cuda_sanity_check = [ "cuda", "plonky2_field/cuda_sanity_check" ] + [dependencies] ahash = { workspace = true } @@ -26,6 +36,7 @@ itertools = { workspace = true } keccak-hash = { version = "0.8.0", default-features = false } log = { workspace = true } num = { workspace = true } +once_cell = { workspace = true, features = ["std"] } rand = { workspace = true } rand_chacha = { version = "0.3.1", optional = true, default-features = false } serde = { workspace = true, features = ["rc"] } @@ -38,6 +49,9 @@ plonky2_field = { version = "1.0.0", path = "../field", default-features = false plonky2_maybe_rayon = { version = "1.0.0", path = "../maybe_rayon", default-features = false } plonky2_util = { version = "1.0.0", path = "../util", default-features = false } +# cuda accelerator wrapper +zeknox = { workspace = true } + # Plonky3 dependencies for Poseidon2 p3-poseidon2 = { git = "https://github.com/Plonky3/Plonky3.git", rev = "eeb4e37b20127c4daa871b2bad0df30a7c7380db" } p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git", rev = "eeb4e37b20127c4daa871b2bad0df30a7c7380db" } diff --git a/plonky2/benches/merkle.rs b/plonky2/benches/merkle.rs index 6230c1343..e9995be1a 100644 --- a/plonky2/benches/merkle.rs +++ b/plonky2/benches/merkle.rs @@ -23,7 +23,7 @@ pub(crate) fn bench_merkle_tree>(c: &mut Criterion) { let size = 1 << size_log; group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, _| { let leaves = vec![F::rand_vec(ELEMS_PER_LEAF); size]; - b.iter(|| MerkleTree::::new(leaves.clone(), 0)); + b.iter(|| MerkleTree::::new_from_2d(leaves.clone(), 0)); }); } } diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index be491aa21..2f8bb84ae 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -38,6 +38,27 @@ fn work>() -> Result<()> { cur_target = temp; } + #[cfg(feature = "cuda")] + { + use plonky2_util::log2_ceil; + + let size = log2_ceil(builder.num_gates()); + + zeknox::clear_cuda_errors_rs(); + println!( + "Initializing CUDA twiddle factors for dimeinsions 2^{} and 2^{}", + size, + size + 3 + ); + + zeknox::init_twiddle_factors_rs(0, size); + zeknox::init_twiddle_factors_rs(0, size + 3); + + // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, size + 3, coset_gen_u64); + } + // Public inputs are the two initial values (provided below) and the result (which is generated). builder.register_public_input(initial_a); builder.register_public_input(initial_b); diff --git a/plonky2/src/batch_fri/oracle.rs b/plonky2/src/batch_fri/oracle.rs index 1f34cfef3..76316d8f5 100644 --- a/plonky2/src/batch_fri/oracle.rs +++ b/plonky2/src/batch_fri/oracle.rs @@ -15,7 +15,7 @@ use crate::fri::oracle::PolynomialBatch; use crate::fri::proof::FriProof; use crate::fri::structure::{FriBatchInfo, FriInstanceInfo}; use crate::fri::FriParams; -use crate::hash::batch_merkle_tree::BatchMerkleTree; +// use crate::hash::batch_merkle_tree::BatchMerkleTree; use crate::hash::hash_types::RichField; use crate::iop::challenger::Challenger; use crate::plonk::config::GenericConfig; @@ -300,6 +300,18 @@ mod test { reduction_arity_bits, }; + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for all dimensions that will be used + // This test involves multiple polynomials and recursive verification, + // so we initialize a wider range of dimensions to be safe + let current_log_size = k0 + fri_params.config.rate_bits; + for i in 0..=current_log_size + 5 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let n0 = 1 << k0; let n1 = 1 << k1; let n2 = 1 << k2; diff --git a/plonky2/src/batch_fri/prover.rs b/plonky2/src/batch_fri/prover.rs index e71fe25b4..a12f87667 100644 --- a/plonky2/src/batch_fri/prover.rs +++ b/plonky2/src/batch_fri/prover.rs @@ -104,7 +104,8 @@ pub(crate) fn batch_fri_committed_trees< reverse_index_bits_in_place(&mut final_values.values); let chunked_values = final_values.values.par_chunks(arity).map(flatten).collect(); - let tree = MerkleTree::::new(chunked_values, fri_params.config.cap_height); + let tree = + MerkleTree::::new_from_2d(chunked_values, fri_params.config.cap_height); challenger.observe_cap(&tree.cap); trees.push(tree); @@ -263,6 +264,17 @@ mod tests { }; let n = 1 << k; + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for all dimensions that will be used + let current_log_size = k + fri_params.config.rate_bits; + for i in 0..=current_log_size { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let trace = PolynomialValues::new((1..n + 1).map(F::from_canonical_i64).collect_vec()); let polynomial_batch: BatchFriOracle = BatchFriOracle::from_values( @@ -359,6 +371,16 @@ mod tests { reduction_arity_bits, }; + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for all dimensions that will be used + let current_log_size = k0 + fri_params.config.rate_bits; + for i in 0..=current_log_size { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let n0 = 1 << k0; let n1 = 1 << k1; let n2 = 1 << k2; diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index e413071a4..292f2a0cf 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -62,10 +62,14 @@ impl, C: GenericConfig, const D: usize> timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, ) -> Self { + // The first IFFT is always done on CPU to avoid strange GPU issue. let coeffs = timed!( timing, - "IFFT", - values.into_par_iter().map(|v| v.ifft()).collect::>() + "CPU IFFT", + values + .into_par_iter() + .map(|v| v.ifft_cpu()) + .collect::>() ); Self::from_coeffs( @@ -86,6 +90,35 @@ impl, C: GenericConfig, const D: usize> cap_height: usize, timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, + ) -> Self { + #[cfg(feature = "cuda")] + return Self::from_coeffs_gpu( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ); + #[cfg(not(feature = "cuda"))] + Self::from_coeffs_cpu( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } + + /// Creates a list polynomial commitment for the polynomials `polynomials`. + fn from_coeffs_cpu( + polynomials: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + fft_root_table: Option<&FftRootTable>, ) -> Self { let degree = polynomials[0].len(); let lde_values = timed!( @@ -99,7 +132,171 @@ impl, C: GenericConfig, const D: usize> let merkle_tree = timed!( timing, "build Merkle tree", - MerkleTree::new(leaves, cap_height) + MerkleTree::new_from_2d(leaves, cap_height) + ); + + Self { + polynomials, + merkle_tree, + degree_log: log2_strict(degree), + rate_bits, + blinding, + } + } + + #[cfg(feature = "cuda")] + fn from_coeffs_gpu( + polynomials: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + _fft_root_table: Option<&FftRootTable>, + ) -> Self { + assert!(F::CUDA_SUPPORT, "CUDA is not support for this field"); + + let degree = polynomials[0].len(); + + // If blinding, salt with two random elements to each leaf vector. + let salt_size = if blinding { SALT_SIZE } else { 0 }; + + Self::from_coeffs_gpu_helper( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + degree, + salt_size, + ) + } + + #[cfg(feature = "cuda")] + fn from_coeffs_gpu_helper( + polynomials: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + degree: usize, + salt_size: usize, + ) -> Self { + use plonky2_field::util::vec_zeroed; + use zeknox::device::memory::HostOrDeviceSlice; + use zeknox::types::{NTTConfig, TransposeConfig}; + use zeknox::{ntt_batch_ptr, transpose_rev_batch}; + + let lde_size = degree << rate_bits; + let num_polys = polynomials.len() + salt_size; + let total_alloc_size = num_polys * lde_size; + + let salt_polys = (0..salt_size) + .map(|_| F::rand_vec(lde_size)) + .collect::>(); + + // Step 1: Compute coset FFT on GPU, keeping data on GPU + let gpu_lde_values = timed!(timing, "GPU coset FFT", { + // Allocate GPU memory for all polynomials + log::debug!( + "Allocating GPU memory for {} polynomials of size {} (total {} elements)", + num_polys, + lde_size, + total_alloc_size + ); + + let mut gpu_buffer = timed!( + timing, + format!("cuda alloc memory for {} elements", total_alloc_size).as_ref(), + HostOrDeviceSlice::cuda_malloc(0, total_alloc_size) + .expect("Failed to allocate GPU memory") + ); + + // Copy all data to GPU in one go + let mut flat_data = timed!(timing, "Prepare CPU memory", unsafe { + vec_zeroed::(total_alloc_size) + }); + + timed!(timing, "Copy CPU memory", { + for i in 0..polynomials.len() { + flat_data[i * lde_size..i * lde_size + degree] + .copy_from_slice(polynomials[i].coeffs.as_ref()) + } + for i in polynomials.len()..num_polys { + flat_data[i * lde_size..(i + 1) * lde_size] + .copy_from_slice(salt_polys[i - polynomials.len()].as_slice()); + } + }); + + timed!( + timing, + "CPU to GPU", + gpu_buffer + .copy_from_host(&flat_data) + .expect("Failed to copy data to GPU") + ); + + // Perform batched NTT on GPU + // Technically we don't really need to do FFTs for the salt polynomial + // but then the cuda memory becomes extremely difficult to handle + // so we might as well do those FFTs. + let log_domain_size = log2_strict(lde_size); + let ntt_config = NTTConfig { + batches: num_polys as u32, + are_inputs_on_device: true, + are_outputs_on_device: true, + with_coset: true, + ..Default::default() + }; + timed!( + timing, + format!( + "GPU batch NTT for {} poly of degree {}", + num_polys, lde_size + ) + .as_ref(), + ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config) + ); + gpu_buffer + }); + + // Step 2: Transpose on GPU using Zeknox + let gpu_transposed = timed!(timing, "GPU transpose", { + let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloc_size) + .expect("Failed to allocate GPU memory for transpose"); + + let log_n = log2_strict(lde_size); + let transpose_config = TransposeConfig { + batches: num_polys as u32, + are_inputs_on_device: true, + are_outputs_on_device: true, + }; + + transpose_rev_batch( + 0, + gpu_output.as_mut_ptr(), + gpu_lde_values.as_ptr(), + log_n, + transpose_config, + ); + + gpu_output + }); + + // Step 3: Copy back to CPU + let leaves_1d = timed!(timing, "GPU to CPU", { + let mut cpu_data = vec![F::ZERO; total_alloc_size]; + + gpu_transposed + .copy_to_host(&mut cpu_data, total_alloc_size) + .expect("Failed to copy data from GPU"); + + cpu_data + }); + + let merkle_tree = timed!( + timing, + "build Merkle tree", + MerkleTree::new_from_1d(leaves_1d, polynomials.len(), cap_height) ); Self { @@ -142,7 +339,7 @@ impl, C: GenericConfig, const D: usize> pub fn get_lde_values(&self, index: usize, step: usize) -> &[F] { let index = index * step; let index = reverse_bits(index, self.degree_log + self.rate_bits); - let slice = &self.merkle_tree.leaves[index]; + let slice = &self.merkle_tree.get(index); &slice[..slice.len() - if self.blinding { SALT_SIZE } else { 0 }] } diff --git a/plonky2/src/fri/prover.rs b/plonky2/src/fri/prover.rs index 24c88ced7..e5792cb2c 100644 --- a/plonky2/src/fri/prover.rs +++ b/plonky2/src/fri/prover.rs @@ -101,7 +101,8 @@ fn fri_committed_trees, C: GenericConfig, .par_chunks(arity) .map(|chunk: &[F::Extension]| flatten(chunk)) .collect(); - let tree = MerkleTree::::new(chunked_values, fri_params.config.cap_height); + let tree = + MerkleTree::::new_from_2d(chunked_values, fri_params.config.cap_height); challenger.observe_cap(&tree.cap); trees.push(tree); diff --git a/plonky2/src/gadgets/arithmetic_extension.rs b/plonky2/src/gadgets/arithmetic_extension.rs index 9d1088030..6c8a253cf 100644 --- a/plonky2/src/gadgets/arithmetic_extension.rs +++ b/plonky2/src/gadgets/arithmetic_extension.rs @@ -629,6 +629,18 @@ mod tests { #[test] fn test_mul_many() -> Result<()> { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for a range of sizes that might be used + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + // Initialize coset for Goldilocks field (coset generator = 7) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; @@ -665,6 +677,18 @@ mod tests { #[test] fn test_div_extension() -> Result<()> { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for a range of sizes that might be used + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + // Initialize coset for Goldilocks field (coset generator = 7) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; @@ -692,6 +716,18 @@ mod tests { #[test] fn test_mul_algebra() -> Result<()> { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for a range of sizes that might be used + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + // Initialize coset for Goldilocks field (coset generator = 7) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + const D: usize = 2; type C = KeccakGoldilocksConfig; type F = >::F; diff --git a/plonky2/src/gates/equality_base.rs b/plonky2/src/gates/equality_base.rs index 50a315e81..86c302e18 100644 --- a/plonky2/src/gates/equality_base.rs +++ b/plonky2/src/gates/equality_base.rs @@ -160,7 +160,6 @@ impl, const D: usize> Gate for EqualityGate { ) }) .collect(); - //println!("generators {:?}", result.len()); result } diff --git a/plonky2/src/hash/keccak.rs b/plonky2/src/hash/keccak.rs index d3fa8c4b2..61e7cb87e 100644 --- a/plonky2/src/hash/keccak.rs +++ b/plonky2/src/hash/keccak.rs @@ -7,7 +7,7 @@ use keccak_hash::keccak; use crate::hash::hash_types::{BytesHash, RichField}; use crate::hash::hashing::PlonkyPermutation; -use crate::plonk::config::Hasher; +use crate::plonk::config::{Hasher, HasherType}; use crate::util::serialization::Write; pub const SPONGE_RATE: usize = 8; @@ -103,6 +103,7 @@ impl PlonkyPermutation for KeccakPermutation { pub struct KeccakHash; impl Hasher for KeccakHash { const HASH_SIZE: usize = N; + const HASHER_TYPE: HasherType = HasherType::Keccak; type Hash = BytesHash; type Permutation = KeccakPermutation; diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs index 74963a2a4..cbd20cea5 100644 --- a/plonky2/src/hash/merkle_proofs.rs +++ b/plonky2/src/hash/merkle_proofs.rs @@ -240,6 +240,7 @@ impl, const D: usize> CircuitBuilder { /// Same as `verify_batch_merkle_proof_to_cap`, except with the final "cap index" as separate parameter, /// rather than being contained in `leaf_index_bits`. + #[allow(dead_code)] pub(crate) fn verify_batch_merkle_proof_to_cap_with_cap_index>( &mut self, leaf_data: &[Vec], @@ -342,7 +343,8 @@ mod tests { let n = 1 << log_n; let cap_height = 1; let leaves = random_data::(n, 7); - let tree = MerkleTree::>::Hasher>::new(leaves, cap_height); + let tree = + MerkleTree::>::Hasher>::new_from_2d(leaves, cap_height); let i: usize = OsRng.gen_range(0..n); let proof = tree.prove(i); @@ -359,9 +361,10 @@ mod tests { let i_c = builder.constant(F::from_canonical_usize(i)); let i_bits = builder.split_le(i_c, log_n); - let data = builder.add_virtual_targets(tree.leaves[i].len()); + let data = builder.add_virtual_targets(tree.leaf_size); + let leaf = tree.get(i); for j in 0..data.len() { - pw.set_target(data[j], tree.leaves[i][j])?; + pw.set_target(data[j], leaf[j])?; } builder.verify_merkle_proof_to_cap::<>::InnerHasher>( diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 31bcf5e37..4773d71ab 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -1,16 +1,52 @@ -#[cfg(not(feature = "std"))] -use alloc::vec::Vec; use core::mem::MaybeUninit; use core::slice; +use std::collections::HashSet; +#[cfg(feature = "cuda")] +use std::sync::Arc; +#[cfg(feature = "cuda")] +use std::sync::Mutex; +use std::time::Instant; +#[cfg(not(feature = "std"))] +use std::vec::Vec; +use num::range; +#[cfg(feature = "cuda")] +use once_cell::sync::Lazy; use plonky2_maybe_rayon::*; use serde::{Deserialize, Serialize}; +#[cfg(feature = "cuda")] +use zeknox::device::memory::HostOrDeviceSlice; +#[cfg(feature = "cuda")] +use zeknox::device::stream::CudaStream; +#[cfg(feature = "cuda")] +use zeknox::{ + fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, +}; use crate::hash::hash_types::RichField; +#[cfg(feature = "cuda")] +use crate::hash::hash_types::NUM_HASH_OUT_ELTS; use crate::hash::merkle_proofs::MerkleProof; +#[cfg(feature = "cuda")] +use crate::plonk::config::HasherType; use crate::plonk::config::{GenericHashOut, Hasher}; use crate::util::log2_strict; +#[cfg(feature = "cuda")] +pub static GPU_ID: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); + +#[cfg(feature = "cuda")] +fn print_time(_now: Instant, _msg: &str) { + // println!("Time {} {} ms", _msg, _now.elapsed().as_millis()); +} + +#[cfg(not(feature = "cuda"))] +#[allow(dead_code)] +fn print_time(_now: Instant, _msg: &str) {} + +#[cfg(feature = "cuda")] +const FORCE_SINGLE_GPU: bool = true; + /// The Merkle cap of height `h` of a Merkle tree is the `h`-th layer (from the root) of the tree. /// It can be used in place of the root to verify Merkle paths, which are `h` elements shorter. #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -45,7 +81,10 @@ impl> MerkleCap { #[derive(Clone, Debug, Eq, PartialEq)] pub struct MerkleTree> { /// The data in the leaves of the Merkle tree. - pub leaves: Vec>, + // pub leaves: Vec>, + pub leaves: Vec, + + pub leaf_size: usize, /// The digests in the tree. Consists of `cap.len()` sub-trees, each corresponding to one /// element in `cap`. Each subtree is contiguous and located at @@ -64,6 +103,7 @@ pub struct MerkleTree> { impl> Default for MerkleTree { fn default() -> Self { Self { + leaf_size: 0, leaves: Vec::new(), digests: Vec::new(), cap: MerkleCap::default(), @@ -71,7 +111,7 @@ impl> Default for MerkleTree { } } -pub(crate) fn capacity_up_to_mut(v: &mut Vec, len: usize) -> &mut [MaybeUninit] { +fn capacity_up_to_mut(v: &mut Vec, len: usize) -> &mut [MaybeUninit] { assert!(v.capacity() >= len); let v_ptr = v.as_mut_ptr().cast::>(); unsafe { @@ -83,59 +123,105 @@ pub(crate) fn capacity_up_to_mut(v: &mut Vec, len: usize) -> &mut [MaybeUn } } -pub(crate) fn fill_subtree>( +fn fill_subtree>( digests_buf: &mut [MaybeUninit], - leaves: &[Vec], + leaves: &[F], + leaf_size: usize, ) -> H::Hash { - assert_eq!(leaves.len(), digests_buf.len() / 2 + 1); - if digests_buf.is_empty() { - H::hash_or_noop(&leaves[0]) - } else { - // Layout is: left recursive output || left child digest - // || right child digest || right recursive output. - // Split `digests_buf` into the two recursive outputs (slices) and two child digests - // (references). - let (left_digests_buf, right_digests_buf) = digests_buf.split_at_mut(digests_buf.len() / 2); - let (left_digest_mem, left_digests_buf) = left_digests_buf.split_last_mut().unwrap(); - let (right_digest_mem, right_digests_buf) = right_digests_buf.split_first_mut().unwrap(); - // Split `leaves` between both children. - let (left_leaves, right_leaves) = leaves.split_at(leaves.len() / 2); - - let (left_digest, right_digest) = plonky2_maybe_rayon::join( - || fill_subtree::(left_digests_buf, left_leaves), - || fill_subtree::(right_digests_buf, right_leaves), - ); + let leaves_count = leaves.len() / leaf_size; + + // if one leaf => return it hash + if leaves_count == 1 { + let hash = H::hash_or_noop(leaves); + digests_buf[0].write(hash); + return hash; + } + // if two leaves => return their concat hash + if leaves_count == 2 { + let (leaf1, leaf2) = leaves.split_at(leaf_size); + let hash_left = H::hash_or_noop(leaf1); + let hash_right = H::hash_or_noop(leaf2); + digests_buf[0].write(hash_left); + digests_buf[1].write(hash_right); + return H::two_to_one(hash_left, hash_right); + } + + assert_eq!(leaves_count, digests_buf.len() / 2 + 1); + + // leaves first - we can do all in parallel + let (_, digests_leaves) = digests_buf.split_at_mut(digests_buf.len() - leaves_count); + digests_leaves + .into_par_iter() + .enumerate() + .for_each(|(leaf_idx, digest)| { + let (_, r) = leaves.split_at(leaf_idx * leaf_size); + let (leaf, _) = r.split_at(leaf_size); + digest.write(H::hash_or_noop(leaf)); + }); - left_digest_mem.write(left_digest); - right_digest_mem.write(right_digest); - H::two_to_one(left_digest, right_digest) + // internal nodes - we can do in parallel per level + let mut last_index = digests_buf.len() - leaves_count; + + for level_log in range(1, log2_strict(leaves_count)).rev() { + let level_size = 1 << level_log; + let (_, digests_slice) = digests_buf.split_at_mut(last_index - level_size); + let (digests_slice, next_digests) = digests_slice.split_at_mut(level_size); + + digests_slice + .into_par_iter() + .zip(last_index - level_size..last_index) + .for_each(|(digest, idx)| { + let left_idx = 2 * (idx + 1) - last_index; + let right_idx = left_idx + 1; + + unsafe { + let left_digest = next_digests[left_idx].assume_init(); + let right_digest = next_digests[right_idx].assume_init(); + digest.write(H::two_to_one(left_digest, right_digest)); + } + }); + last_index -= level_size; } + + // return cap hash + let hash: >::Hash; + unsafe { + let left_digest = digests_buf[0].assume_init(); + let right_digest = digests_buf[1].assume_init(); + hash = H::two_to_one(left_digest, right_digest); + } + hash } -pub(crate) fn fill_digests_buf>( +fn fill_digests_buf>( digests_buf: &mut [MaybeUninit], cap_buf: &mut [MaybeUninit], - leaves: &[Vec], + leaves: &Vec, + leaf_size: usize, cap_height: usize, ) { // Special case of a tree that's all cap. The usual case will panic because we'll try to split // an empty slice into chunks of `0`. (We would not need this if there was a way to split into // `blah` chunks as opposed to chunks _of_ `blah`.) + let leaves_count = leaves.len() / leaf_size; + if digests_buf.is_empty() { - debug_assert_eq!(cap_buf.len(), leaves.len()); + debug_assert_eq!(cap_buf.len(), leaves_count); cap_buf .par_iter_mut() - .zip(leaves) - .for_each(|(cap_buf, leaf)| { + .enumerate() + .for_each(|(leaf_idx, cap_buf)| { + let (_, r) = leaves.split_at(leaf_idx * leaf_size); + let (leaf, _) = r.split_at(leaf_size); cap_buf.write(H::hash_or_noop(leaf)); }); return; } let subtree_digests_len = digests_buf.len() >> cap_height; - let subtree_leaves_len = leaves.len() >> cap_height; + let subtree_leaves_len = leaves_count >> cap_height; let digests_chunks = digests_buf.par_chunks_exact_mut(subtree_digests_len); - let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len); + let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len * leaf_size); assert_eq!(digests_chunks.len(), cap_buf.len()); assert_eq!(digests_chunks.len(), leaves_chunks.len()); digests_chunks.zip(cap_buf).zip(leaves_chunks).for_each( @@ -143,55 +229,251 @@ pub(crate) fn fill_digests_buf>( // We have `1 << cap_height` sub-trees, one for each entry in `cap`. They are totally // independent, so we schedule one task for each. `digests_buf` and `leaves` are split // into `1 << cap_height` slices, one for each sub-tree. - subtree_cap.write(fill_subtree::(subtree_digests, subtree_leaves)); + subtree_cap.write(fill_subtree::( + subtree_digests, + subtree_leaves, + leaf_size, + )); }, ); + + // TODO - debug code - to remove in future + /* + let digests_count: u64 = digests_buf.len().try_into().unwrap(); + let leaves_count: u64 = leaves.len().try_into().unwrap(); + let cap_height: u64 = cap_height.try_into().unwrap(); + let leaf_size: u64 = leaves[0].len().try_into().unwrap(); + let fname = format!("cpu-{}-{}-{}-{}.txt", digests_count, leaves_count, leaf_size, cap_height); + let mut file = File::create(fname).unwrap(); + for digest in digests_buf { + unsafe { + let hash = digest.assume_init().to_vec(); + for x in hash { + let str = format!("{} ", x.to_canonical_u64()); + file.write_all(str.as_bytes()); + } + } + file.write_all(b"\n"); + } + */ +} + +#[cfg(feature = "cuda")] +#[repr(C)] +union U8U64 { + f1: [u8; 32], + f2: [u64; 4], +} + +#[cfg(feature = "cuda")] +fn fill_digests_buf_gpu>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + let leaves_count = leaves.len() / leaf_size; + + let num_gpus: usize = std::env::var("NUM_OF_GPUS") + .unwrap_or("1".to_string()) + .parse() + .unwrap(); + + let mut gpu_id_lock = GPU_ID.lock().unwrap(); + let gpu_id = *gpu_id_lock; + *gpu_id_lock += 1; + if *gpu_id_lock >= num_gpus as u64 { + *gpu_id_lock = 0; + } + log::debug!("Using GPU id {} leave length {}", gpu_id, leaves.len()); + + let now = Instant::now(); + let gpu_leaves_buf_result = HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()); + + if gpu_leaves_buf_result.is_err() { + panic!("CUDA malloc failed, falling back to CPU for Merkle tree generation"); + } + + let mut gpu_leaves_buf = gpu_leaves_buf_result.unwrap(); + print_time(now, "alloc gpu leaves buffer"); + + let now = Instant::now(); + let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice()); + print_time(now, "leaves copy to gpu"); + + let now = Instant::now(); + fill_digests_buf_gpu_ptr::( + digests_buf, + cap_buf, + gpu_leaves_buf.as_mut_ptr(), + leaves_count, + leaf_size, + cap_height, + gpu_id, + ); + print_time(now, "fill_digests_buf_gpu_ptr"); } -pub(crate) fn merkle_tree_prove>( - leaf_index: usize, +#[cfg(feature = "cuda")] +fn fill_digests_buf_gpu_ptr>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves_ptr: *const F, leaves_len: usize, + leaf_len: usize, cap_height: usize, - digests: &[H::Hash], -) -> Vec { - let num_layers = log2_strict(leaves_len) - cap_height; - debug_assert_eq!(leaf_index >> (cap_height + num_layers), 0); - - let digest_len = 2 * (leaves_len - (1 << cap_height)); - assert_eq!(digest_len, digests.len()); - - let digest_tree: &[H::Hash] = { - let tree_index = leaf_index >> num_layers; - let tree_len = digest_len >> cap_height; - &digests[tree_len * tree_index..tree_len * (tree_index + 1)] + gpu_id: u64, +) { + let digests_count: u64 = digests_buf.len().try_into().unwrap(); + let leaves_count: u64 = leaves_len.try_into().unwrap(); + let caps_count: u64 = cap_buf.len().try_into().unwrap(); + let cap_height: u64 = cap_height.try_into().unwrap(); + let leaf_size: u64 = leaf_len.try_into().unwrap(); + + let now = Instant::now(); + // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors + let digests_size = if digests_buf.len() == 0 { + NUM_HASH_OUT_ELTS + } else { + digests_buf.len() * NUM_HASH_OUT_ELTS + }; + let caps_size = if cap_buf.len() == 0 { + NUM_HASH_OUT_ELTS + } else { + cap_buf.len() * NUM_HASH_OUT_ELTS }; - // Mask out high bits to get the index within the sub-tree. - let mut pair_index = leaf_index & ((1 << num_layers) - 1); - (0..num_layers) - .map(|i| { - let parity = pair_index & 1; - pair_index >>= 1; - - // The layers' data is interleaved as follows: - // [layer 0, layer 1, layer 0, layer 2, layer 0, layer 1, layer 0, layer 3, ...]. - // Each of the above is a pair of siblings. - // `pair_index` is the index of the pair within layer `i`. - // The index of that the pair within `digests` is - // `pair_index * 2 ** (i + 1) + (2 ** i - 1)`. - let siblings_index = (pair_index << (i + 1)) + (1 << i) - 1; - // We have an index for the _pair_, but we want the index of the _sibling_. - // Double the pair index to get the index of the left sibling. Conditionally add `1` - // if we are to retrieve the right sibling. - let sibling_index = 2 * siblings_index + (1 - parity); - digest_tree[sibling_index] - }) - .collect() + let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap(); + let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap(); + + unsafe { + let num_gpus: usize = std::env::var("NUM_OF_GPUS") + .unwrap_or("1".to_string()) + .parse() + .unwrap(); + if !FORCE_SINGLE_GPU + && leaves_count >= (1 << 12) + && cap_height > 0 + && num_gpus > 1 + && H::HASHER_TYPE == HasherType::PoseidonBN128 + { + // println!("Multi GPU"); + fill_digests_buf_linear_multigpu_with_gpu_ptr( + gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + leaves_ptr as *mut core::ffi::c_void, + digests_count, + caps_count, + leaves_count, + leaf_size, + cap_height, + H::HASHER_TYPE as u64, + ); + } else { + // println!("Single GPU"); + fill_digests_buf_linear_gpu_with_gpu_ptr( + gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + leaves_ptr as *mut core::ffi::c_void, + digests_count, + caps_count, + leaves_count, + leaf_size, + cap_height, + H::HASHER_TYPE as u64, + gpu_id, + ); + } + } + print_time(now, "fill init"); + + let mut host_digests: Vec = vec![F::ZERO; digests_size]; + let mut host_caps: Vec = vec![F::ZERO; caps_size]; + let stream1 = CudaStream::create().unwrap(); + let stream2 = CudaStream::create().unwrap(); + + gpu_digests_buf + .copy_to_host_async(host_digests.as_mut_slice(), &stream1) + .expect("copy digests"); + gpu_cap_buf + .copy_to_host_async(host_caps.as_mut_slice(), &stream2) + .expect("copy caps"); + stream1.synchronize().expect("cuda sync"); + stream2.synchronize().expect("cuda sync"); + stream1.destroy().expect("cuda stream destroy"); + stream2.destroy().expect("cuda stream destroy"); + + let now = Instant::now(); + + if digests_buf.len() > 0 { + host_digests + .chunks_exact(4) + .zip(digests_buf) + .for_each(|(x, y)| { + unsafe { + let mut parts = U8U64 { f1: [0; 32] }; + parts.f2[0] = x[0].to_canonical_u64(); + parts.f2[1] = x[1].to_canonical_u64(); + parts.f2[2] = x[2].to_canonical_u64(); + parts.f2[3] = x[3].to_canonical_u64(); + let (slice, _) = parts.f1.split_at(H::HASH_SIZE); + let h: H::Hash = H::Hash::from_bytes(slice); + y.write(h); + }; + }); + } + + if cap_buf.len() > 0 { + host_caps.chunks_exact(4).zip(cap_buf).for_each(|(x, y)| { + unsafe { + let mut parts = U8U64 { f1: [0; 32] }; + parts.f2[0] = x[0].to_canonical_u64(); + parts.f2[1] = x[1].to_canonical_u64(); + parts.f2[2] = x[2].to_canonical_u64(); + parts.f2[3] = x[3].to_canonical_u64(); + let (slice, _) = parts.f1.split_at(H::HASH_SIZE); + let h: H::Hash = H::Hash::from_bytes(slice); + y.write(h); + }; + }); + } + print_time(now, "copy results"); +} + +#[cfg(feature = "cuda")] +fn fill_digests_buf_meta>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + // if the input is small or if it Keccak hashing, just do the hashing on CPU + if leaf_size <= H::HASH_SIZE / 8 || H::HASHER_TYPE == HasherType::Keccak { + fill_digests_buf::(digests_buf, cap_buf, leaves, leaf_size, cap_height); + } else { + fill_digests_buf_gpu::(digests_buf, cap_buf, leaves, leaf_size, cap_height); + } +} + +#[cfg(not(feature = "cuda"))] +fn fill_digests_buf_meta>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + fill_digests_buf::(digests_buf, cap_buf, leaves, leaf_size, cap_height); } impl> MerkleTree { - pub fn new(leaves: Vec>, cap_height: usize) -> Self { - let log2_leaves_len = log2_strict(leaves.len()); + pub fn new_from_1d(leaves_1d: Vec, leaf_size: usize, cap_height: usize) -> Self { + let leaves_len = leaves_1d.len() / leaf_size; + let log2_leaves_len = log2_strict(leaves_len); assert!( cap_height <= log2_leaves_len, "cap_height={} should be at most log2(leaves.len())={}", @@ -199,7 +481,7 @@ impl> MerkleTree { log2_leaves_len ); - let num_digests = 2 * (leaves.len() - (1 << cap_height)); + let num_digests = 2 * (leaves_len - (1 << cap_height)); let mut digests = Vec::with_capacity(num_digests); let len_cap = 1 << cap_height; @@ -207,7 +489,42 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); - fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); + + #[cfg(feature = "cuda_sanity_check")] + let (digests_buf_cpu, cap_cpu) = { + let mut digests_buf_cpu = digests_buf.to_vec(); + let mut cap_buf_cpu = cap_buf.to_vec(); + + fill_digests_buf::( + &mut digests_buf_cpu, + &mut cap_buf_cpu, + &leaves_1d.clone(), + leaf_size, + cap_height, + ); + + (digests_buf_cpu, cap_buf_cpu) + }; + + fill_digests_buf_meta::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); + + #[cfg(feature = "cuda_sanity_check")] + { + for i in 0..num_digests { + unsafe { + let hash1 = digests_buf[i].assume_init(); + let hash2 = digests_buf_cpu[i].assume_init(); + assert_eq!(hash1, hash2, "Digest mismatch at index {}", i); + } + } + for i in 0..len_cap { + unsafe { + let hash1 = cap_buf[i].assume_init(); + let hash2 = cap_cpu[i].assume_init(); + assert_eq!(hash1, hash2, "Cap mismatch at index {}", i); + } + } + } unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to @@ -215,38 +532,355 @@ impl> MerkleTree { digests.set_len(num_digests); cap.set_len(len_cap); } + Self { + leaves: leaves_1d, + leaf_size, + digests, + cap: MerkleCap(cap), + } + } + + pub fn new_from_2d(leaves_2d: Vec>, cap_height: usize) -> Self { + let leaf_size = leaves_2d[0].len(); + let leaves_count = leaves_2d.len(); + let zeros = vec![F::from_canonical_u64(0); leaf_size]; + let mut leaves_1d: Vec = Vec::with_capacity(leaves_count * leaf_size); + for idx in 0..leaves_count { + if leaves_2d[idx].len() == 0 { + leaves_1d.extend(zeros.clone()); + } else { + leaves_1d.extend(leaves_2d[idx].clone()); + } + } + Self::new_from_1d(leaves_1d, leaf_size, cap_height) + } + pub fn new_from_fields( + leaves_1d: Vec, + leaf_size: usize, + digests: Vec, + cap: MerkleCap, + ) -> Self { Self { - leaves, + leaves: leaves_1d, + leaf_size, + digests, + cap, + } + } + + #[cfg(feature = "cuda")] + pub fn new_from_gpu_leaves( + leaves_gpu_ptr: &HostOrDeviceSlice<'_, F>, + leaves_len: usize, + leaf_len: usize, + cap_height: usize, + ) -> Self { + use plonky2_field::util::vec_zeroed; + + let log2_leaves_len = log2_strict(leaves_len); + assert!( + cap_height <= log2_leaves_len, + "cap_height={} should be at most log2(leaves.len())={}", + cap_height, + log2_leaves_len + ); + + // copy data from GPU in async mode + let mut host_leaves: Vec = unsafe { vec_zeroed(leaves_len * leaf_len) }; + let stream_copy = CudaStream::create().unwrap(); + + let start = std::time::Instant::now(); + leaves_gpu_ptr + .copy_to_host_async(host_leaves.as_mut_slice(), &stream_copy) + .expect("copy to host error"); + print_time(start, "copy leaves from GPU async"); + + let num_digests = 2 * (leaves_len - (1 << cap_height)); + let mut digests = Vec::with_capacity(num_digests); + + let len_cap = 1 << cap_height; + let mut cap = Vec::with_capacity(len_cap); + + let digests_buf = capacity_up_to_mut(&mut digests, num_digests); + let cap_buf = capacity_up_to_mut(&mut cap, len_cap); + let now = Instant::now(); + let gpu_id = 0; + fill_digests_buf_gpu_ptr::( + digests_buf, + cap_buf, + leaves_gpu_ptr.as_ptr(), + leaves_len, + leaf_len, + cap_height, + gpu_id, + ); + print_time(now, "fill digests buffer"); + + unsafe { + // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to + // `num_digests` and `len_cap`, resp. + digests.set_len(num_digests); + cap.set_len(len_cap); + } + /* + println!{"Digest Buffer"}; + for dg in &digests { + println!("{:?}", dg); + } + println!{"Cap Buffer"}; + for dg in &cap { + println!("{:?}", dg); + } + */ + let _ = stream_copy.synchronize(); + let _ = stream_copy.destroy(); + + Self { + leaves: host_leaves, + leaf_size: leaf_len, digests, cap: MerkleCap(cap), } } pub fn get(&self, i: usize) -> &[F] { - &self.leaves[i] + let (_, v) = self.leaves.split_at(i * self.leaf_size); + let (v, _) = v.split_at(self.leaf_size); + v + } + + pub fn get_leaves_1d(&self) -> Vec { + self.leaves.clone() + } + + pub fn get_leaves_2d(&self) -> Vec> { + let v2d: Vec> = self + .leaves + .chunks_exact(self.leaf_size) + .map(|leaf| leaf.to_vec()) + .collect(); + v2d + } + + pub fn get_leaves_count(&self) -> usize { + self.leaves.len() / self.leaf_size + } + + pub fn change_leaf_and_update(&mut self, leaf: Vec, leaf_index: usize) { + assert_eq!(leaf.len(), self.leaf_size); + let leaves_count = self.leaves.len() / self.leaf_size; + assert!(leaf_index < leaves_count); + + let cap_height = log2_strict(self.cap.len()); + let mut leaves = self.leaves.clone(); + let start = leaf_index * self.leaf_size; + let leaf_copy = leaf.clone(); + leaf.into_iter() + .enumerate() + .for_each(|(i, el)| leaves[start + i] = el); + + let digests_len = self.digests.len(); + let cap_len = self.cap.0.len(); + let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len); + let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len); + self.leaves = leaves; + if digests_buf.is_empty() { + cap_buf[leaf_index].write(H::hash_or_noop(leaf_copy.as_slice())); + } else { + let subtree_leaves_len = leaves_count >> cap_height; + let subtree_idx = leaf_index / subtree_leaves_len; + let subtree_digests_len = digests_buf.len() >> cap_height; + let subtree_offset = subtree_idx * subtree_digests_len; + let idx_in_subtree = + subtree_digests_len - subtree_leaves_len + leaf_index % subtree_leaves_len; + if subtree_leaves_len == 2 { + digests_buf[subtree_offset + idx_in_subtree] + .write(H::hash_or_noop(leaf_copy.as_slice())); + } else { + assert!(subtree_leaves_len > 2); + let idx = subtree_offset + idx_in_subtree; + digests_buf[idx].write(H::hash_or_noop(leaf_copy.as_slice())); + let mut child_idx: i64 = idx_in_subtree as i64; + let mut parent_idx: i64 = child_idx / 2 - 1; + while child_idx > 1 { + unsafe { + let mut left_idx = subtree_offset + child_idx as usize; + let mut right_idx = subtree_offset + child_idx as usize + 1; + if child_idx & 1 == 1 { + left_idx = subtree_offset + child_idx as usize - 1; + right_idx = subtree_offset + child_idx as usize; + } + let left_digest = digests_buf[left_idx].assume_init(); + let right_digest = digests_buf[right_idx].assume_init(); + digests_buf[subtree_offset + parent_idx as usize] + .write(H::two_to_one(left_digest, right_digest)); + } + child_idx = parent_idx; + parent_idx = child_idx / 2 - 1; + } + } + unsafe { + let left_digest = digests_buf[subtree_offset].assume_init(); + let right_digest = digests_buf[subtree_offset + 1].assume_init(); + cap_buf[subtree_idx].write(H::two_to_one(left_digest, right_digest)); + } + } + } + + pub fn change_leaves_in_range_and_update( + &mut self, + new_leaves: Vec>, + start_index: usize, + end_index: usize, + ) { + assert_eq!(new_leaves.len(), end_index - start_index); + assert_eq!(new_leaves[0].len(), self.leaf_size); + + let tree_leaves_count = self.leaves.len() / self.leaf_size; + assert!(start_index < end_index); + assert!(end_index < tree_leaves_count); + + let cap_height = log2_strict(self.cap.len()); + let mut leaves = self.leaves.clone(); + + leaves[start_index * self.leaf_size..end_index * self.leaf_size] + .chunks_exact_mut(self.leaf_size) + .zip(new_leaves.clone()) + .for_each(|(x, y)| { + for j in 0..self.leaf_size { + x[j] = y[j]; + } + }); + + let digests_len = self.digests.len(); + let cap_len = self.cap.0.len(); + let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len); + let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len); + self.leaves = leaves; + if digests_buf.is_empty() { + cap_buf[start_index..end_index] + .par_iter_mut() + .zip(new_leaves) + .for_each(|(cap, leaf)| { + cap.write(H::hash_or_noop(leaf.as_slice())); + }); + } else { + let subtree_leaves_len = tree_leaves_count >> cap_height; + let subtree_digests_len = digests_buf.len() >> cap_height; + + let mut positions: Vec = (start_index..end_index) + .map(|idx| { + let subtree_idx = idx / subtree_leaves_len; + let subtree_offset = subtree_idx * subtree_digests_len; + let idx_in_subtree = + subtree_digests_len - subtree_leaves_len + idx % subtree_leaves_len; + subtree_offset + idx_in_subtree + }) + .collect(); + + // TODO change to parallel loop + for i in 0..positions.len() { + digests_buf[positions[i]].write(H::hash_or_noop(new_leaves[i].as_slice())); + } + + if subtree_digests_len > 2 { + let rounds = log2_strict(tree_leaves_count) - cap_height - 1; + for _ in 0..rounds { + let mut parent_indexes: HashSet = HashSet::new(); + let parents: Vec = positions + .par_iter() + .map(|pos| { + let subtree_offset = pos / subtree_digests_len; + let idx_in_subtree = pos % subtree_digests_len; + let mut parent_idx = 0; + if idx_in_subtree > 1 { + parent_idx = idx_in_subtree / 2 - 1; + } + subtree_offset * subtree_digests_len + parent_idx + }) + .collect(); + for p in parents { + parent_indexes.insert(p); + } + positions = parent_indexes.into_iter().collect(); + + // TODO change to parallel loop + for i in 0..positions.len() { + let subtree_offset = positions[i] / subtree_digests_len; + let idx_in_subtree = positions[i] % subtree_digests_len; + let digest_idx = + subtree_offset * subtree_digests_len + 2 * (idx_in_subtree + 1); + unsafe { + let left_digest = digests_buf[digest_idx].assume_init(); + let right_digest = digests_buf[digest_idx + 1].assume_init(); + digests_buf[positions[i]] + .write(H::two_to_one(left_digest, right_digest)); + } + } + } + } + + let mut cap_indexes: HashSet = HashSet::new(); + for idx in start_index..end_index { + cap_indexes.insert(idx / subtree_leaves_len); + } + + unsafe { + for idx in cap_indexes { + let digest_idx = idx * subtree_digests_len; + let left_digest = digests_buf[digest_idx].assume_init(); + let right_digest = digests_buf[digest_idx + 1].assume_init(); + cap_buf[idx].write(H::two_to_one(left_digest, right_digest)); + } + } + } } /// Create a Merkle proof from a leaf index. pub fn prove(&self, leaf_index: usize) -> MerkleProof { let cap_height = log2_strict(self.cap.len()); - let siblings = - merkle_tree_prove::(leaf_index, self.leaves.len(), cap_height, &self.digests); + let num_layers = log2_strict(self.get_leaves_count()) - cap_height; + let subtree_digest_size = (1 << (num_layers + 1)) - 2; // 2 ^ (k+1) - 2 + let subtree_idx = leaf_index / (1 << num_layers); + + let siblings: Vec<>::Hash> = Vec::with_capacity(num_layers); + if num_layers == 0 { + return MerkleProof { siblings }; + } + + // digests index where we start + let idx = subtree_digest_size - (1 << num_layers) + (leaf_index % (1 << num_layers)); + + let siblings = (0..num_layers) + .map(|i| { + // relative index + let rel_idx = (idx + 2 - (1 << i + 1)) / (1 << i); + // absolute index + let mut abs_idx = subtree_idx * subtree_digest_size + rel_idx; + if (rel_idx & 1) == 1 { + abs_idx -= 1; + } else { + abs_idx += 1; + } + self.digests[abs_idx] + }) + .collect(); MerkleProof { siblings } } } #[cfg(test)] -pub(crate) mod tests { +mod tests { use anyhow::Result; use super::*; use crate::field::extension::Extendable; use crate::hash::merkle_proofs::verify_merkle_proof_to_cap; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; + use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig}; - pub(crate) fn random_data(n: usize, k: usize) -> Vec> { + fn random_data(n: usize, k: usize) -> Vec> { (0..n).map(|_| F::rand_vec(k)).collect() } @@ -258,7 +892,7 @@ pub(crate) mod tests { leaves: Vec>, cap_height: usize, ) -> Result<()> { - let tree = MerkleTree::::new(leaves.clone(), cap_height); + let tree = MerkleTree::::new_from_2d(leaves.clone(), cap_height); for (i, leaf) in leaves.into_iter().enumerate() { let proof = tree.prove(i); verify_merkle_proof_to_cap(leaf, i, &tree.cap, &proof)?; @@ -266,6 +900,224 @@ pub(crate) mod tests { Ok(()) } + fn verify_change_leaf_and_update(log_n: usize, cap_h: usize) { + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let n = 1 << log_n; + let k = 7; + let mut leaves = random_data::(n, k); + + let mut mt1 = + MerkleTree::>::Hasher>::new_from_2d(leaves.clone(), cap_h); + + let tmp = random_data::(1, k); + leaves[0] = tmp[0].clone(); + let mt2 = MerkleTree::>::Hasher>::new_from_2d(leaves, cap_h); + + mt1.change_leaf_and_update(tmp[0].clone(), 0); + + /* + println!("Tree 1"); + mt1.digests.into_iter().for_each( + |x| { + println!("{:?}", x); + } + ); + println!("Tree 2"); + mt2.digests.into_iter().for_each( + |x| { + println!("{:?}", x); + } + ); + */ + + mt1.digests + .into_par_iter() + .zip(mt2.digests) + .for_each(|(d1, d2)| { + assert_eq!(d1, d2); + }); + + mt1.cap + .0 + .into_par_iter() + .zip(mt2.cap.0) + .for_each(|(d1, d2)| { + assert_eq!(d1, d2); + }); + } + + fn verify_change_leaf_and_update_range_one_by_one( + leaves_count: usize, + leaf_size: usize, + cap_height: usize, + start_index: usize, + end_index: usize, + ) { + use plonky2_field::types::Field; + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let raw_leaves: Vec> = random_data::(leaves_count, leaf_size); + let vals: Vec> = random_data::(end_index - start_index, leaf_size); + + let mut leaves1_1d: Vec = raw_leaves.into_iter().flatten().collect(); + let leaves2_1d: Vec = leaves1_1d.clone(); + + let mut tree2 = MerkleTree::>::Hasher>::new_from_1d( + leaves2_1d, leaf_size, cap_height, + ); + + // v1 + let now = Instant::now(); + for i in start_index..end_index { + for j in 0..leaf_size { + leaves1_1d[i * leaf_size + j] = vals[i - start_index][j]; + } + } + let tree1 = MerkleTree::>::Hasher>::new_from_1d( + leaves1_1d, leaf_size, cap_height, + ); + println!("Time V1: {} ms", now.elapsed().as_millis()); + + // v2 + let now = Instant::now(); + for idx in start_index..end_index { + let mut leaf: Vec = vec![F::from_canonical_u64(0); leaf_size]; + for j in 0..leaf_size { + leaf[j] = vals[idx - start_index][j]; + } + tree2.change_leaf_and_update(leaf, idx); + } + println!("Time V2: {} ms", now.elapsed().as_millis()); + + // compare leaves + let t2leaves = tree2.get_leaves_1d(); + tree1 + .get_leaves_1d() + .chunks_exact(leaf_size) + .enumerate() + .for_each(|(i, x)| { + let mut ok = true; + for j in 0..leaf_size { + if x[j] != t2leaves[i * leaf_size + j] { + ok = false; + break; + } + } + if !ok { + println!("Leaves different at index {:?}", i); + } + assert!(ok); + }); + + // compare trees + tree1.digests.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.digests[i]; + if x != y { + println!("Digests different at index {:?}", i); + } + assert_eq!(x, y); + }); + tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.cap.0[i]; + if x != y { + println!("Cap different at index {:?}", i); + } + assert_eq!(x, y); + }); + } + + fn verify_change_leaf_and_update_range( + leaves_count: usize, + leaf_size: usize, + cap_height: usize, + start_index: usize, + end_index: usize, + ) { + // use plonky2_field::types::Field; + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let raw_leaves: Vec> = random_data::(leaves_count, leaf_size); + let vals: Vec> = random_data::(end_index - start_index, leaf_size); + + let mut leaves1_1d: Vec = raw_leaves.into_iter().flatten().collect(); + let leaves2_1d: Vec = leaves1_1d.clone(); + + let mut tree2 = MerkleTree::>::Hasher>::new_from_1d( + leaves2_1d, leaf_size, cap_height, + ); + + // v1 + let now = Instant::now(); + for i in start_index..end_index { + for j in 0..leaf_size { + leaves1_1d[i * leaf_size + j] = vals[i - start_index][j]; + } + } + let tree1 = MerkleTree::>::Hasher>::new_from_1d( + leaves1_1d, leaf_size, cap_height, + ); + println!("Time V1: {} ms", now.elapsed().as_millis()); + + // v2 + let now = Instant::now(); + /* + for idx in start_index..end_index { + let mut leaf: Vec = vec![F::from_canonical_u64(0); leaf_size]; + for j in 0..leaf_size { + leaf[j] = vals[idx - start_index][j]; + } + tree2.change_leaf_and_update(leaf, idx); + } + */ + tree2.change_leaves_in_range_and_update(vals, start_index, end_index); + println!("Time V2: {} ms", now.elapsed().as_millis()); + + // compare leaves + let t2leaves = tree2.get_leaves_1d(); + tree1 + .get_leaves_1d() + .chunks_exact(leaf_size) + .enumerate() + .for_each(|(i, x)| { + let mut ok = true; + for j in 0..leaf_size { + if x[j] != t2leaves[i * leaf_size + j] { + ok = false; + break; + } + } + if !ok { + println!("Leaves different at index {:?}", i); + } + assert!(ok); + }); + + // compare trees + tree1.digests.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.digests[i]; + if x != y { + println!("Digests different at index {:?}", i); + } + assert_eq!(x, y); + }); + tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.cap.0[i]; + if x != y { + println!("Cap different at index {:?}", i); + } + assert_eq!(x, y); + }); + } + #[test] #[should_panic] fn test_cap_height_too_big() { @@ -277,7 +1129,7 @@ pub(crate) mod tests { let cap_height = log_n + 1; // Should panic if `cap_height > len_n`. let leaves = random_data::(1 << log_n, 7); - let _ = MerkleTree::>::Hasher>::new(leaves, cap_height); + let _ = MerkleTree::>::Hasher>::new_from_2d(leaves, cap_height); } #[test] @@ -296,12 +1148,89 @@ pub(crate) mod tests { } #[test] - fn test_merkle_trees() -> Result<()> { + fn test_change_leaf_and_update() -> Result<()> { + // small tree, 1 cap + verify_change_leaf_and_update(3, 0); + // small tree, 2 cap + verify_change_leaf_and_update(3, 1); + // small tree, 4 cap + verify_change_leaf_and_update(3, 2); + // small tree, all cap + verify_change_leaf_and_update(3, 3); + + // big tree + verify_change_leaf_and_update(12, 3); + + Ok(()) + } + + #[test] + fn test_change_leaf_and_update_range() -> Result<()> { + for h in 0..11 { + println!( + "Run verify_change_leaf_and_update_range_one_by_one() for height {:?}", + h + ); + verify_change_leaf_and_update_range_one_by_one(1024, 68, h, 32, 48); + println!( + "Run verify_change_leaf_and_update_range() for height {:?}", + h + ); + verify_change_leaf_and_update_range(1024, 68, h, 32, 48); + } + + Ok(()) + } + + #[test] + fn test_merkle_trees_poseidon_g64() -> Result<()> { const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; - let log_n = 8; + // GPU warmup + #[cfg(feature = "cuda")] + let _x: HostOrDeviceSlice<'_, F> = HostOrDeviceSlice::cuda_malloc(0, 64).unwrap(); + + let log_n = 12; + let n = 1 << log_n; + let leaves = random_data::(n, 7); + + verify_all_leaves::(leaves, 1)?; + + Ok(()) + } + + #[cfg(feature = "cuda")] + #[test] + fn test_merkle_trees_cuda_poseidon_g64() -> Result<()> { + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let log_n = 14; + let n = 1 << log_n; + let leaves = random_data::(n, 7); + let leaves_1d: Vec = leaves.into_iter().flatten().collect(); + + let mut gpu_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(0, n * 7).unwrap(); + gpu_data + .copy_from_host(leaves_1d.as_slice()) + .expect("copy data to gpu"); + + MerkleTree::>::Hasher>::new_from_gpu_leaves(&gpu_data, n, 7, 1); + + Ok(()) + } + + #[test] + fn test_merkle_trees_keccak() -> Result<()> { + const D: usize = 2; + type C = KeccakGoldilocksConfig; + type F = >::F; + + let log_n = 12; let n = 1 << log_n; let leaves = random_data::(n, 7); diff --git a/plonky2/src/hash/mod.rs b/plonky2/src/hash/mod.rs index 0d3436973..d8090310d 100644 --- a/plonky2/src/hash/mod.rs +++ b/plonky2/src/hash/mod.rs @@ -2,7 +2,7 @@ //! as well as specific hash functions implementation. mod arch; -pub mod batch_merkle_tree; +// pub mod batch_merkle_tree; pub mod hash_types; pub mod hashing; pub mod keccak; diff --git a/plonky2/src/hash/path_compression.rs b/plonky2/src/hash/path_compression.rs index 517576bf0..5a7f7e1ca 100644 --- a/plonky2/src/hash/path_compression.rs +++ b/plonky2/src/hash/path_compression.rs @@ -129,8 +129,14 @@ mod tests { type F = >::F; let h = 10; let cap_height = 3; - let vs = (0..1 << h).map(|_| vec![F::rand()]).collect::>(); - let mt = MerkleTree::>::Hasher>::new(vs.clone(), cap_height); + let vs = (0..1 << h) + .flat_map(|_| vec![F::rand()]) + .collect::>(); + let mt = MerkleTree::>::Hasher>::new_from_1d( + vs.clone(), + 1, + cap_height, + ); let mut rng = OsRng; let k = rng.gen_range(1..=1 << h); @@ -139,7 +145,10 @@ mod tests { let compressed_proofs = compress_merkle_proofs(cap_height, &indices, &proofs); let decompressed_proofs = decompress_merkle_proofs( - &indices.iter().map(|&i| vs[i].clone()).collect::>(), + &indices + .iter() + .map(|&i| vec![vs[i].clone()]) + .collect::>(), &indices, &compressed_proofs, h, diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs index a7c763252..2b6dd0ac2 100644 --- a/plonky2/src/hash/poseidon.rs +++ b/plonky2/src/hash/poseidon.rs @@ -18,7 +18,7 @@ use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation}; use crate::iop::ext_target::ExtensionTarget; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; -use crate::plonk::config::{AlgebraicHasher, Hasher}; +use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType}; pub const SPONGE_RATE: usize = 8; pub const SPONGE_CAPACITY: usize = 4; @@ -874,6 +874,7 @@ impl PlonkyPermutation< pub struct PoseidonHash; impl Hasher for PoseidonHash { const HASH_SIZE: usize = 4 * 8; + const HASHER_TYPE: HasherType = HasherType::Poseidon; type Hash = HashOut; type Permutation = PoseidonPermutation; diff --git a/plonky2/src/hash/poseidon2/hash.rs b/plonky2/src/hash/poseidon2/hash.rs index d6fb72bfd..e73db08bc 100644 --- a/plonky2/src/hash/poseidon2/hash.rs +++ b/plonky2/src/hash/poseidon2/hash.rs @@ -12,7 +12,7 @@ use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation}; use crate::iop::ext_target::ExtensionTarget; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; -use crate::plonk::config::{AlgebraicHasher, Hasher}; +use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType}; pub trait Poseidon2: PrimeField64 { #[inline] @@ -498,6 +498,7 @@ fn sum_12(inputs: &[F]) -> F { pub struct Poseidon2Hash; impl Hasher for Poseidon2Hash { const HASH_SIZE: usize = 4 * 8; + const HASHER_TYPE: HasherType = HasherType::Poseidon2; type Hash = HashOut; type Permutation = Poseidon2Permutation; diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs index f81508b7a..d24d8d42b 100644 --- a/plonky2/src/iop/generator.rs +++ b/plonky2/src/iop/generator.rs @@ -36,7 +36,6 @@ pub fn generate_partial_witness< let config = &common_data.config; let generators = &prover_data.generators; let generator_indices_by_watches = &prover_data.generator_indices_by_watches; - let mut witness = PartitionWitness::new( config.num_wires, common_data.degree(), diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs index 8772ecfc0..4bf8cc982 100644 --- a/plonky2/src/lib.rs +++ b/plonky2/src/lib.rs @@ -11,7 +11,7 @@ pub extern crate alloc; #[doc(inline)] pub use plonky2_field as field; -pub mod batch_fri; +// pub mod batch_fri; pub mod fri; pub mod gadgets; pub mod gates; diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index 421d15cdc..74fa4be62 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -1225,6 +1225,24 @@ impl, const D: usize> CircuitBuilder { let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor))); let fft_root_table = fft_root_table(max_fft_points); + // Initialize GPU twiddle factors for all sizes we'll use + #[cfg(feature = "cuda")] + { + if F::CUDA_SUPPORT { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for degree and LDE sizes + // degree_bits: the base degree + // degree_bits + rate_bits: the LDE size used in from_coeffs + // We need to initialize from 0 to cover all potential sizes + let max_log_size = degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor)); + for i in 0..=max_log_size { + zeknox::init_twiddle_factors_rs(0, i); + } + } + } + + // This part of the code on GPU is buggy. So we use CPU for computation. + // It does not impact performance as this is only done once during setup. let constants_sigmas_commitment = if commit_to_sigma { let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat(); PolynomialBatch::::from_values( diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs index e06e9b69b..61aa36ab0 100644 --- a/plonky2/src/plonk/circuit_data.rs +++ b/plonky2/src/plonk/circuit_data.rs @@ -19,6 +19,7 @@ use core::ops::{Range, RangeFrom}; use std::collections::BTreeMap; use anyhow::Result; +use log::Level; use serde::Serialize; use super::circuit_builder::LookupWire; @@ -213,12 +214,11 @@ impl, C: GenericConfig, const D: usize> } pub fn prove(&self, inputs: PartialWitness) -> Result> { - prove::( - &self.prover_only, - &self.common, - inputs, - &mut TimingTree::default(), - ) + let mut timing = TimingTree::new("CircuitData::prove", Level::Debug); + + let res = prove::(&self.prover_only, &self.common, inputs, &mut timing); + timing.print(); + res } pub fn verify(&self, proof_with_pis: ProofWithPublicInputs) -> Result<()> { diff --git a/plonky2/src/plonk/config.rs b/plonky2/src/plonk/config.rs index 4bbcbc4bc..b3c44fce4 100644 --- a/plonky2/src/plonk/config.rs +++ b/plonky2/src/plonk/config.rs @@ -24,6 +24,14 @@ use crate::hash::poseidon2::hash::Poseidon2Hash; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; +#[derive(PartialEq, Debug, Copy, Clone)] +pub enum HasherType { + Poseidon = 0, + Keccak = 1, + PoseidonBN128 = 2, + Poseidon2 = 3, +} + pub trait GenericHashOut: Copy + Clone + Debug + Eq + PartialEq + Send + Sync + Serialize + DeserializeOwned { @@ -35,6 +43,8 @@ pub trait GenericHashOut: /// Trait for hash functions. pub trait Hasher: Sized + Copy + Debug + Eq + PartialEq { + const HASHER_TYPE: HasherType; + /// Size of `Hash` in bytes. const HASH_SIZE: usize; diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index ac0a683cf..18ce4fc64 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -7,6 +7,7 @@ use core::mem::swap; use anyhow::{ensure, Result}; use hashbrown::HashMap; +use plonky2_field::util::vec_zeroed; use plonky2_maybe_rayon::*; use super::circuit_builder::{LookupChallenges, LookupWire}; @@ -159,17 +160,16 @@ where "compute full witness", partition_witness.full_witness() ); - let wires_values: Vec> = timed!( timing, "compute wire polynomials", + // Use sequential iteration for deterministic results witness .wire_values .par_iter() .map(|column| PolynomialValues::new(column.clone())) .collect() ); - let wires_commitment = timed!( timing, "compute wires commitment", @@ -182,7 +182,6 @@ where prover_data.fft_root_table.as_ref(), ) ); - let mut challenger = Challenger::::new(); // Observe the FRI config @@ -270,6 +269,7 @@ where &gammas, &deltas, &alphas, + timing, ) ); @@ -304,6 +304,7 @@ where challenger.observe_cap::("ient_polys_commitment.merkle_tree.cap); let zeta = challenger.get_extension_challenge::(); + // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`. @@ -326,6 +327,7 @@ where common_data ) ); + challenger.observe_openings(&openings.to_fri_openings()); let instance = common_data.get_fri_instance(zeta); @@ -621,6 +623,7 @@ fn compute_quotient_polys< gammas: &[F], deltas: &[F], alphas: &[F], + timing: &mut TimingTree, ) -> Vec> { let num_challenges = common_data.config.num_challenges; @@ -640,7 +643,12 @@ fn compute_quotient_polys< // steps away since we work on an LDE of degree `max_filtered_constraint_degree`. let next_step = 1 << quotient_degree_bits; - let points = F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits); + let points = timed!( + timing, + "set up subgroup generators", + F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits) + ); + let lde_size = points.len(); let z_h_on_coset = ZeroPolyOnCoset::new(common_data.degree_bits(), quotient_degree_bits); @@ -649,167 +657,184 @@ fn compute_quotient_polys< // These values are used to produce the final RE constraints for each lut, // and are the same each time in check_lookup_constraints_batched. // lut_poly_evals[i][j] gives the eval for the i'th challenge and the j'th lookup table - let lut_re_poly_evals: Vec> = if has_lookup { - let num_lut_slots = LookupTableGate::num_slots(&common_data.config); - (0..num_challenges) - .map(move |i| { - let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)]; - let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize]; - - (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors) - .map(|r| { - let lut_row_number = common_data.luts - [r - LookupSelectors::StartEnd as usize] - .len() - .div_ceil(num_lut_slots); - - get_lut_poly( - common_data, - r - LookupSelectors::StartEnd as usize, - cur_deltas, - num_lut_slots * lut_row_number, - ) - .eval(cur_challenge_delta) - }) - .collect() - }) - .collect() - } else { - vec![] - }; + let lut_re_poly_evals: Vec> = timed!( + timing, + "compute LUT RE polynomial evals", + if has_lookup { + let num_lut_slots = LookupTableGate::num_slots(&common_data.config); + (0..num_challenges) + .map(move |i| { + let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)]; + let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize]; + + (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors) + .map(|r| { + let lut_row_number = common_data.luts + [r - LookupSelectors::StartEnd as usize] + .len() + .div_ceil(num_lut_slots); + + get_lut_poly( + common_data, + r - LookupSelectors::StartEnd as usize, + cur_deltas, + num_lut_slots * lut_row_number, + ) + .eval(cur_challenge_delta) + }) + .collect() + }) + .collect() + } else { + vec![] + } + ); let lut_re_poly_evals_refs: Vec<&[F]> = lut_re_poly_evals.iter().map(|v| v.as_slice()).collect(); - let points_batches = points.par_chunks(BATCH_SIZE); let num_batches = points.len().div_ceil(BATCH_SIZE); - let quotient_values: Vec> = points_batches - .enumerate() - .flat_map(|(batch_i, xs_batch)| { - // Each batch must be the same size, except the last one, which may be smaller. - debug_assert!( - xs_batch.len() == BATCH_SIZE - || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE) - ); - - let indices_batch: Vec = - (BATCH_SIZE * batch_i..BATCH_SIZE * batch_i + xs_batch.len()).collect(); - - let mut shifted_xs_batch = Vec::with_capacity(xs_batch.len()); - let mut local_zs_batch = Vec::with_capacity(xs_batch.len()); - let mut next_zs_batch = Vec::with_capacity(xs_batch.len()); - - let mut local_lookup_batch = Vec::with_capacity(xs_batch.len()); - let mut next_lookup_batch = Vec::with_capacity(xs_batch.len()); - - let mut partial_products_batch = Vec::with_capacity(xs_batch.len()); - let mut s_sigmas_batch = Vec::with_capacity(xs_batch.len()); - - let mut local_constants_batch_refs = Vec::with_capacity(xs_batch.len()); - let mut local_wires_batch_refs = Vec::with_capacity(xs_batch.len()); - - for (&i, &x) in indices_batch.iter().zip(xs_batch) { - let shifted_x = F::coset_shift() * x; - let i_next = (i + next_step) % lde_size; - let local_constants_sigmas = prover_data - .constants_sigmas_commitment - .get_lde_values(i, step); - let local_constants = &local_constants_sigmas[common_data.constants_range()]; - let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()]; - let local_wires = wires_commitment.get_lde_values(i, step); - let local_zs_partial_and_lookup = - zs_partial_products_and_lookup_commitment.get_lde_values(i, step); - let next_zs_partial_and_lookup = - zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step); - - let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()]; - - let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()]; - - let partial_products = - &local_zs_partial_and_lookup[common_data.partial_products_range()]; - - if has_lookup { - let local_lookup_zs = &local_zs_partial_and_lookup[common_data.lookup_range()]; - - let next_lookup_zs = &next_zs_partial_and_lookup[common_data.lookup_range()]; - debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys()); - - local_lookup_batch.push(local_lookup_zs); - next_lookup_batch.push(next_lookup_zs); - } + let quotient_values: Vec> = timed!(timing, "compute quotient value", { + points + .par_chunks(BATCH_SIZE) + .enumerate() + .flat_map(|(batch_i, xs_batch)| { + // Each batch must be the same size, except the last one, which may be smaller. + debug_assert!( + xs_batch.len() == BATCH_SIZE + || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE) + ); - debug_assert_eq!(local_wires.len(), common_data.config.num_wires); - debug_assert_eq!(local_zs.len(), num_challenges); + let batch_size = xs_batch.len(); + let batch_start = BATCH_SIZE * batch_i; - local_constants_batch_refs.push(local_constants); - local_wires_batch_refs.push(local_wires); + let mut shifted_xs_batch = Vec::with_capacity(batch_size); + let mut local_zs_batch = Vec::with_capacity(batch_size); + let mut next_zs_batch = Vec::with_capacity(batch_size); - shifted_xs_batch.push(shifted_x); - local_zs_batch.push(local_zs); - next_zs_batch.push(next_zs); - partial_products_batch.push(partial_products); - s_sigmas_batch.push(s_sigmas); - } + let mut local_lookup_batch = Vec::with_capacity(batch_size); + let mut next_lookup_batch = Vec::with_capacity(batch_size); + + let mut partial_products_batch = Vec::with_capacity(batch_size); + let mut s_sigmas_batch = Vec::with_capacity(batch_size); + + let mut local_constants_batch_refs = Vec::with_capacity(batch_size); + let mut local_wires_batch_refs = Vec::with_capacity(batch_size); + + for (j, &x) in xs_batch.iter().enumerate() { + let i = batch_start + j; + let shifted_x = F::coset_shift() * x; + let i_next = (i + next_step) % lde_size; + let local_constants_sigmas = prover_data + .constants_sigmas_commitment + .get_lde_values(i, step); + let local_constants = &local_constants_sigmas[common_data.constants_range()]; + let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()]; + let local_wires = wires_commitment.get_lde_values(i, step); + let local_zs_partial_and_lookup = + zs_partial_products_and_lookup_commitment.get_lde_values(i, step); + let next_zs_partial_and_lookup = + zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step); + + let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()]; + + let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()]; - // NB (JN): I'm not sure how (in)efficient the below is. It needs measuring. - let mut local_constants_batch = - vec![F::ZERO; xs_batch.len() * local_constants_batch_refs[0].len()]; - for i in 0..local_constants_batch_refs[0].len() { - for (j, constants) in local_constants_batch_refs.iter().enumerate() { - local_constants_batch[i * xs_batch.len() + j] = constants[i]; + let partial_products = + &local_zs_partial_and_lookup[common_data.partial_products_range()]; + + if has_lookup { + let local_lookup_zs = + &local_zs_partial_and_lookup[common_data.lookup_range()]; + + let next_lookup_zs = + &next_zs_partial_and_lookup[common_data.lookup_range()]; + debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys()); + + local_lookup_batch.push(local_lookup_zs); + next_lookup_batch.push(next_lookup_zs); + } + + debug_assert_eq!(local_wires.len(), common_data.config.num_wires); + debug_assert_eq!(local_zs.len(), num_challenges); + + local_constants_batch_refs.push(local_constants); + local_wires_batch_refs.push(local_wires); + + shifted_xs_batch.push(shifted_x); + local_zs_batch.push(local_zs); + next_zs_batch.push(next_zs); + partial_products_batch.push(partial_products); + s_sigmas_batch.push(s_sigmas); } - } - let mut local_wires_batch = - vec![F::ZERO; xs_batch.len() * local_wires_batch_refs[0].len()]; - for i in 0..local_wires_batch_refs[0].len() { - for (j, wires) in local_wires_batch_refs.iter().enumerate() { - local_wires_batch[i * xs_batch.len() + j] = wires[i]; + // Optimized transposition with better cache locality + let n_constants = local_constants_batch_refs[0].len(); + let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants]; + for i in 0..n_constants { + let offset = i * xs_batch.len(); + for (j, constants) in local_constants_batch_refs.iter().enumerate() { + local_constants_batch[offset + j] = constants[i]; + } } - } - let vars_batch = EvaluationVarsBaseBatch::new( - xs_batch.len(), - &local_constants_batch, - &local_wires_batch, - public_inputs_hash, - ); + let n_wires = local_wires_batch_refs[0].len(); + let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires]; + for i in 0..n_wires { + let offset = i * xs_batch.len(); + for (j, wires) in local_wires_batch_refs.iter().enumerate() { + local_wires_batch[offset + j] = wires[i]; + } + } - let mut quotient_values_batch = eval_vanishing_poly_base_batch::( - common_data, - &indices_batch, - &shifted_xs_batch, - vars_batch, - &local_zs_batch, - &next_zs_batch, - &local_lookup_batch, - &next_lookup_batch, - &partial_products_batch, - &s_sigmas_batch, - betas, - gammas, - deltas, - alphas, - &z_h_on_coset, - &lut_re_poly_evals_refs, - ); - - for (&i, quotient_values) in indices_batch.iter().zip(quotient_values_batch.iter_mut()) - { - let denominator_inv = z_h_on_coset.eval_inverse(i); - quotient_values - .iter_mut() - .for_each(|v| *v *= denominator_inv); - } - quotient_values_batch - }) - .collect(); + let vars_batch = EvaluationVarsBaseBatch::new( + xs_batch.len(), + &local_constants_batch, + &local_wires_batch, + public_inputs_hash, + ); - transpose("ient_values) - .into_par_iter() - .map(PolynomialValues::new) - .map(|values| values.coset_ifft(F::coset_shift())) - .collect() + let indices_batch: Vec = (batch_start..batch_start + batch_size).collect(); + let mut quotient_values_batch = eval_vanishing_poly_base_batch::( + common_data, + &indices_batch, + &shifted_xs_batch, + vars_batch, + &local_zs_batch, + &next_zs_batch, + &local_lookup_batch, + &next_lookup_batch, + &partial_products_batch, + &s_sigmas_batch, + betas, + gammas, + deltas, + alphas, + &z_h_on_coset, + &lut_re_poly_evals_refs, + ); + + for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() { + let i = batch_start + j; + let denominator_inv = z_h_on_coset.eval_inverse(i); + quotient_values + .iter_mut() + .for_each(|v| *v *= denominator_inv); + } + + quotient_values_batch + }) + .collect() + }); + + timed!( + timing, + "transpose and final ifft", + transpose("ient_values) + .into_par_iter() + .map(PolynomialValues::new) + .map(|values| values.coset_ifft(F::coset_shift())) + .collect() + ) } diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs index 48179ce63..6a5b8a266 100644 --- a/plonky2/src/plonk/vanishing_poly.rs +++ b/plonky2/src/plonk/vanishing_poly.rs @@ -211,20 +211,29 @@ pub(crate) fn eval_vanishing_poly_base_batch, const let num_challenges = common_data.config.num_challenges; let num_routed_wires = common_data.config.num_routed_wires; - let mut numerator_values = Vec::with_capacity(num_routed_wires); - let mut denominator_values = Vec::with_capacity(num_routed_wires); + // Pre-allocate reusable buffers with exact capacities + let mut numerator_values = Vec::with_capacity(num_routed_wires * num_challenges); + let mut denominator_values = Vec::with_capacity(num_routed_wires * num_challenges); // The L_0(x) (Z(x) - 1) vanishing terms. let mut vanishing_z_1_terms = Vec::with_capacity(num_challenges); // The terms checking the partial products. - let mut vanishing_partial_products_terms = Vec::new(); + let mut vanishing_partial_products_terms = Vec::with_capacity(num_challenges * num_prods); // The terms checking the lookup constraints. - let mut vanishing_all_lookup_terms = if has_lookup { + let lookup_terms_capacity = if has_lookup { let num_sldc_polys = common_data.num_lookup_polys - 1; - Vec::with_capacity( - common_data.config.num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys), - ) + num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys) + } else { + 0 + }; + let mut vanishing_all_lookup_terms = Vec::with_capacity(lookup_terms_capacity); + + // Pre-allocate selector buffer if needed + let selector_offset = common_data.selectors_info.num_selectors(); + let num_lookup_selectors = common_data.num_lookup_selectors; + let mut lookup_selectors = if has_lookup && num_lookup_selectors > 0 { + Vec::with_capacity(num_lookup_selectors) } else { Vec::new() }; @@ -235,22 +244,20 @@ pub(crate) fn eval_vanishing_poly_base_batch, const let x = xs_batch[k]; let vars = vars_batch.view(k); - let lookup_selectors: Vec = (0..common_data.num_lookup_selectors) - .map(|i| vars.local_constants[common_data.selectors_info.num_selectors() + i]) - .collect(); + // Reuse lookup_selectors buffer + if has_lookup { + lookup_selectors.clear(); + lookup_selectors.extend( + (0..num_lookup_selectors).map(|i| vars.local_constants[selector_offset + i]), + ); + } let local_zs = local_zs_batch[k]; let next_zs = next_zs_batch[k]; - let local_lookup_zs = if has_lookup { - local_lookup_zs_batch[k] - } else { - &[] - }; - - let next_lookup_zs = if has_lookup { - next_lookup_zs_batch[k] + let (local_lookup_zs, next_lookup_zs) = if has_lookup { + (local_lookup_zs_batch[k], next_lookup_zs_batch[k]) } else { - &[] + (&[][..], &[][..]) }; let partial_products = partial_products_batch[k]; @@ -259,6 +266,16 @@ pub(crate) fn eval_vanishing_poly_base_batch, const let constraint_terms = PackedStridedView::new(&constraint_terms_batch, n, k); let l_0_x = z_h_on_coset.eval_l_0(index, x); + + // Pre-compute common values for all challenges + let beta_x_s_ids: Vec = if num_challenges > 0 { + (0..num_routed_wires) + .map(|j| common_data.k_is[j] * x) + .collect() + } else { + Vec::new() + }; + for i in 0..num_challenges { let z_x = local_zs[i]; let z_gx = next_zs[i]; @@ -268,10 +285,10 @@ pub(crate) fn eval_vanishing_poly_base_batch, const if has_lookup { let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)]; - let cur_local_lookup_zs = &local_lookup_zs - [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)]; - let cur_next_lookup_zs = &next_lookup_zs - [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)]; + let lookup_poly_start = common_data.num_lookup_polys * i; + let lookup_poly_end = lookup_poly_start + common_data.num_lookup_polys; + let cur_local_lookup_zs = &local_lookup_zs[lookup_poly_start..lookup_poly_end]; + let cur_next_lookup_zs = &next_lookup_zs[lookup_poly_start..lookup_poly_end]; let lookup_constraints = check_lookup_constraints_batch( common_data, @@ -285,17 +302,17 @@ pub(crate) fn eval_vanishing_poly_base_batch, const vanishing_all_lookup_terms.extend(lookup_constraints); } - numerator_values.extend((0..num_routed_wires).map(|j| { - let wire_value = vars.local_wires[j]; - let k_i = common_data.k_is[j]; - let s_id = k_i * x; - wire_value + betas[i] * s_id + gammas[i] - })); - denominator_values.extend((0..num_routed_wires).map(|j| { + let beta_i = betas[i]; + let gamma_i = gammas[i]; + + // Compute numerators and denominators in a single pass + for j in 0..num_routed_wires { let wire_value = vars.local_wires[j]; - let s_sigma = s_sigmas[j]; - wire_value + betas[i] * s_sigma + gammas[i] - })); + numerator_values + .push(wire_value + gamma_i.multiply_accumulate(beta_i, beta_x_s_ids[j])); + denominator_values + .push(wire_value + gamma_i.multiply_accumulate(beta_i, s_sigmas[j])); + } // The partial products considered for this iteration of `i`. let current_partial_products = &partial_products[i * num_prods..(i + 1) * num_prods]; @@ -587,7 +604,9 @@ pub fn check_lookup_constraints_batch, const D: usi // Check RE row transition constraint. let mut cur_sum = next_z_re; for elt in ¤t_lookup_combos { - cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt; + // cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt; + cur_sum = + elt.multiply_accumulate(cur_sum, deltas[LookupChallenges::ChallengeDelta as usize]); } let unfiltered_re_line = z_re - cur_sum; @@ -639,7 +658,10 @@ pub fn check_lookup_constraints_batch, const D: usi let lut_sum_prods_with_mul = (poly * lut_degree ..min((poly + 1) * lut_degree, num_lut_slots)) .fold(F::ZERO, |acc, i| { - acc + vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)] * lut_prod_i(i) + acc.multiply_accumulate( + vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)], + lut_prod_i(i), + ) }); // The previous element is the previous poly of the current row or the last poly of the next row. @@ -656,7 +678,8 @@ pub fn check_lookup_constraints_batch, const D: usi .push(lookup_selectors[LookupSelectors::TransSre as usize] * unfiltered_sum_transition); // Check LDC row and col transitions. It's the same constraint, with a row transition happening for slot == 0. - let unfiltered_ldc_transition = lu_prod * (z_x_lookup_sldcs[poly] - prev) + lu_sum_prods; + let unfiltered_ldc_transition = + lu_sum_prods.multiply_accumulate(lu_prod, z_x_lookup_sldcs[poly] - prev); constraints .push(lookup_selectors[LookupSelectors::TransLdc as usize] * unfiltered_ldc_transition); } diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index 8f9960034..73ec06f7a 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -3,7 +3,7 @@ #[cfg(not(feature = "std"))] use alloc::vec::Vec; -use plonky2_maybe_rayon::*; +use plonky2_maybe_rayon::{MaybeIntoParIter, ParallelIterator}; #[doc(inline)] pub use plonky2_util::*; @@ -23,7 +23,12 @@ pub(crate) fn transpose_poly_values(polys: Vec>) - } pub fn transpose(matrix: &[Vec]) -> Vec> { + if matrix.is_empty() { + return vec![]; + } + let len = matrix[0].len(); + (0..len) .into_par_iter() .map(|i| matrix.iter().map(|row| row[i]).collect()) diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs index 3755851ac..209b43d3c 100644 --- a/plonky2/src/util/serialization/mod.rs +++ b/plonky2/src/util/serialization/mod.rs @@ -321,21 +321,21 @@ pub trait Read { H: Hasher, { let leaves_len = self.read_usize()?; - let mut leaves = Vec::with_capacity(leaves_len); + let leaf_len = self.read_usize()?; + let mut leaves_2d = Vec::with_capacity(leaves_len * leaf_len); for _ in 0..leaves_len { - let leaf_len = self.read_usize()?; - leaves.push(self.read_field_vec(leaf_len)?); + leaves_2d.push(self.read_field_vec(leaf_len)?); } + let leaves_1d = leaves_2d.into_iter().flatten().collect(); + let digests_len = self.read_usize()?; let digests = self.read_hash_vec::(digests_len)?; let cap_height = self.read_usize()?; let cap = self.read_merkle_cap::(cap_height)?; - Ok(MerkleTree { - leaves, - digests, - cap, - }) + Ok(MerkleTree::new_from_fields( + leaves_1d, leaf_len, digests, cap, + )) } /// Reads a value of type [`OpeningSet`] from `self` with the given `common_data`. @@ -1421,10 +1421,11 @@ pub trait Write { F: RichField, H: Hasher, { - self.write_usize(tree.leaves.len())?; - for i in 0..tree.leaves.len() { - self.write_usize(tree.leaves[i].len())?; - self.write_field_vec(&tree.leaves[i])?; + let leaves_count = tree.get_leaves_count(); + self.write_usize(leaves_count)?; + self.write_usize(tree.leaf_size)?; + for i in 0..leaves_count { + self.write_field_vec(&tree.get(i))?; } self.write_hash_vec::(&tree.digests)?; self.write_usize(tree.cap.height())?; diff --git a/test_gpu.sh b/test_gpu.sh new file mode 100755 index 000000000..9979bfeaf --- /dev/null +++ b/test_gpu.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +# GPU Testing Script for Plonky2 +# This script validates CUDA setup, zeknox library, and runs GPU-accelerated tests + +set -e # Exit on any error + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo "=========================================" +echo "Plonky2 GPU Testing Script" +echo "=========================================" +echo "" + +# Step 1: Check NVIDIA driver and CUDA +echo -e "${YELLOW}[1/7] Checking NVIDIA driver and CUDA...${NC}" +if ! command -v nvidia-smi &> /dev/null; then + echo -e "${RED}ERROR: nvidia-smi not found. Please install NVIDIA drivers.${NC}" + exit 1 +fi + +echo "NVIDIA Driver Information:" +nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv +echo "" + +# Check for CUDA toolkit +if command -v nvcc &> /dev/null; then + echo "CUDA Compiler Version:" + nvcc --version | grep "release" + echo "" +elif [ -n "$CUDA_HOME" ]; then + echo "CUDA_HOME is set to: $CUDA_HOME" + echo "" +else + echo -e "${YELLOW}WARNING: nvcc not found and CUDA_HOME not set. CUDA toolkit may not be installed.${NC}" + echo "Continuing anyway as runtime libraries may still be available..." + echo "" +fi + +echo -e "${GREEN} NVIDIA driver check passed${NC}" +echo "" + +# Step 2: Check zeknox library +echo -e "${YELLOW}[2/7] Checking zeknox library...${NC}" +ZEKNOX_PATH="../zeknox" +if [ ! -d "$ZEKNOX_PATH" ]; then + echo -e "${RED}ERROR: zeknox library not found at $ZEKNOX_PATH${NC}" + echo "Expected location: $(cd .. && pwd)/zeknox" + exit 1 +fi + +if [ -d "$ZEKNOX_PATH/wrappers/rust" ]; then + echo "Found zeknox library at: $(cd $ZEKNOX_PATH && pwd)" + echo "Rust wrappers directory exists: $ZEKNOX_PATH/wrappers/rust" +else + echo -e "${YELLOW}WARNING: zeknox/wrappers/rust not found, but zeknox directory exists${NC}" +fi + +echo -e "${GREEN} zeknox library check passed${NC}" +echo "" + +# Step 3: Run field tests +echo -e "${YELLOW}[3/7] Running field tests with GPU acceleration...${NC}" +echo "Command: cd field && cargo test --release --features=cuda -- --test-threads=1" +echo "" + +cd field +if cargo test --release --features=cuda -- --test-threads=1; then + echo "" + echo -e "${GREEN} Field tests passed${NC}" +else + echo "" + echo -e "${RED}ERROR: Field tests failed${NC}" + cd .. + exit 1 +fi +cd .. +echo "" + +# Step 4: Run fibonacci example with CUDA for correctness +echo -e "${YELLOW}[4/7] Running fibonacci example with CUDA features...${NC}" +echo "Command: NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci" +echo "" + +if NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci; then + echo "" + echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}" +else + echo "" + echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}" + exit 1 +fi +echo "" + +# Step 5: Run fibonacci example with CUDA for speed +echo -e "${YELLOW}[5/7] Running fibonacci example with CUDA features...${NC}" +echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda" +echo "" + +if NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda; then + echo "" + echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}" +else + echo "" + echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}" + exit 1 +fi +echo "" + + +# Step 6: Run fibonacci example with CPU +echo -e "${YELLOW}[6/7] Running fibonacci example with CUDA features...${NC}" +echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci" +echo "" + +if cargo run --release --example fibonacci; then + echo "" + echo -e "${GREEN} Fibonacci example completed successfully with CPU${NC}" +else + echo "" + echo -e "${RED}ERROR: Fibonacci example failed with CPU${NC}" + exit 1 +fi +echo "" + +# Step 7: Summary +echo "=========================================" +echo -e "${GREEN}All GPU tests completed successfully!${NC}" +echo "=========================================" +echo "" +echo "Tests run:" +echo "  NVIDIA driver and CUDA verification" +echo "  zeknox library verification" +echo "  Field tests (FFT, polynomials, interpolation, cosets)" +echo "  Fibonacci proof generation with GPU acceleration" +echo "" +echo -e "${GREEN}GPU testing complete!${NC}"