From 0c6091e5adcf153fd916795a3bc060a0f258f6e6 Mon Sep 17 00:00:00 2001 From: zhenfeizhang Date: Fri, 14 Nov 2025 10:18:20 -0500 Subject: [PATCH 01/37] fix parameters --- Cargo.toml | 2 ++ field/src/goldilocks_extensions.rs | 5 +++-- field/src/goldilocks_field.rs | 4 ++-- field/src/lib.rs | 1 - 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 81dbbde49..eed6218b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,8 @@ rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } +zeknox_= { path = "../zeknox/wrappers/rust"} + [profile.release] opt-level = 3 diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs index 8f2d85253..8b7d5607e 100644 --- a/field/src/goldilocks_extensions.rs +++ b/field/src/goldilocks_extensions.rs @@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/2) const DTH_ROOT: Self = Self(18446744069414584320); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = + [Self(18081566051660590251), Self(16121475356294670766)]; - const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)]; + const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; } impl Mul for QuadraticExtension { diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index b0191ca59..4e459c908 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -77,14 +77,14 @@ impl Field for GoldilocksField { const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY; // Sage: `g = GF(p).multiplicative_generator()` - const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720); + const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7); // Sage: // ``` // g_2 = g^((p - 1) / 2^32) // g_2.multiplicative_order().factor() // ``` - const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926); + const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772); const BITS: usize = 64; diff --git a/field/src/lib.rs b/field/src/lib.rs index c713db885..9a2ea4f9c 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -4,7 +4,6 @@ #![deny(rustdoc::broken_intra_doc_links)] #![deny(missing_debug_implementations)] #![feature(specialization)] -#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))] #![cfg_attr(not(test), no_std)] extern crate alloc; From 7065a8277aa1f426a7f3bd2380716c6f8e31dfc3 Mon Sep 17 00:00:00 2001 From: zhenfeizhang Date: Fri, 14 Nov 2025 19:37:21 -0500 Subject: [PATCH 02/37] fix fft --- field/Cargo.toml | 9 +++++ field/src/fft.rs | 60 ++++++++++++++++++++++++++++-- field/src/goldilocks_extensions.rs | 20 ++++++---- field/src/goldilocks_field.rs | 2 + field/src/interpolation.rs | 27 +++++++++++++- field/src/polynomial/mod.rs | 54 +++++++++++++++++++++++++++ field/src/types.rs | 3 ++ 7 files changed, 162 insertions(+), 13 deletions(-) diff --git a/field/Cargo.toml b/field/Cargo.toml index e13f49efd..49cb04494 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] } static_assertions = { workspace = true } unroll = { workspace = true } +# cuda accelerator wrapper +zeknox = { workspace = true } + # Local dependencies plonky2_util = { version = "1.0.0", path = "../util", default-features = false } @@ -29,3 +32,9 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] [lints] workspace = true + + +[features] +# default = [] +default = [ "cuda" ] +cuda = [] \ No newline at end of file diff --git a/field/src/fft.rs b/field/src/fft.rs index d078ca6c3..85defc48b 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -32,16 +32,59 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } +#[cfg(feature = "cuda")] +fn fft_dispatch_gpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + if F::CUDA_SUPPORT { + return ntt_batch( + 0, + input.as_mut_ptr(), + input.len().trailing_zeros() as usize, + NTTConfig::default(), + ); + } else { + return fft_dispatch_cpu(input, zero_factor, root_table); + } +} + +fn fft_dispatch_cpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + if root_table.is_some() { + return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()); + } else { + // let pre_computed = F::pre_compute_fft_root_table(input.len()); + // if pre_computed.is_some() { + // return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap()); + // } else { + // let computed = fft_root_table::(input.len()); + + // return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + // } + let computed = fft_root_table::(input.len()); + + return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + }; +} + #[inline] fn fft_dispatch( input: &mut [F], zero_factor: Option, root_table: Option<&FftRootTable>, ) { - let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len())); - let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap(); + #[cfg(feature = "cuda")] + return fft_dispatch_gpu(input, zero_factor, root_table); - fft_classic(input, zero_factor.unwrap_or(0), used_root_table); + #[cfg(not(feature = "cuda"))] + return fft_dispatch_cpu(input, zero_factor, root_table); } #[inline] @@ -206,6 +249,8 @@ mod tests { use alloc::vec::Vec; use plonky2_util::{log2_ceil, log2_strict}; + #[cfg(feature = "cuda")] + use zeknox::init_twiddle_factors_rs; use crate::fft::{fft, fft_with_options, ifft}; use crate::goldilocks_field::GoldilocksField; @@ -218,6 +263,13 @@ mod tests { let degree = 200usize; let degree_padded = degree.next_power_of_two(); + #[cfg(feature = "cuda")] + let log_degree = { + zeknox::clear_cuda_errors_rs(); + let log_degree = degree_padded.trailing_zeros() as usize; + init_twiddle_factors_rs(0, log_degree); + log_degree + }; // Create a vector of coeffs; the first degree of them are // "random", the last degree_padded-degree of them are zero. let coeffs = (0..degree) @@ -239,6 +291,8 @@ mod tests { } for r in 0..4 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log_degree + r); // expand coefficients by factor 2^r by filling with zeros let zero_tail = coefficients.lde(r); assert_eq!( diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs index 8b7d5607e..6dd15ce0d 100644 --- a/field/src/goldilocks_extensions.rs +++ b/field/src/goldilocks_extensions.rs @@ -45,11 +45,15 @@ impl Extendable<4> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/4) const DTH_ROOT: Self = Self(281474976710656); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = - [Self(0), Self(8295451483910296135), Self(0), Self(0)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ + Self(5024755240244648895), + Self(13227474371289740625), + Self(3912887029498544536), + Self(3900057112666848848), + ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = - [Self(0), Self(0), Self(0), Self(17216955519093520442)]; + [Self(0), Self(0), Self(0), Self(12587610116473453104)]; } impl Mul for QuarticExtension { @@ -71,11 +75,11 @@ impl Extendable<5> for GoldilocksField { const DTH_ROOT: Self = Self(1041288259238279555); const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ - Self(4624713872807171977), - Self(381988216716071028), - Self(14499722700050429911), - Self(4870631734967222356), - Self(4518902370426242880), + Self(2899034827742553394), + Self(13012057356839176729), + Self(14593811582388663055), + Self(7722900811313895436), + Self(4557222484695340057), ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index 4e459c908..ae8457744 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -88,6 +88,8 @@ impl Field for GoldilocksField { const BITS: usize = 64; + const CUDA_SUPPORT: bool = true; + fn order() -> BigUint { Self::ORDER.into() } diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs index df7084572..9772fff56 100644 --- a/field/src/interpolation.rs +++ b/field/src/interpolation.rs @@ -77,6 +77,9 @@ pub fn interpolate2(points: [(F, F); 2], x: F) -> F { #[cfg(test)] mod tests { + #[cfg(feature = "cuda")] + use zeknox::init_twiddle_factors_rs; + use super::*; use crate::extension::quartic::QuarticExtension; use crate::goldilocks_field::GoldilocksField; @@ -87,7 +90,12 @@ mod tests { fn interpolant_random() { type F = GoldilocksField; - for deg in 0..10 { + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + + for deg in 2..10 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log2_ceil(deg)); let domain = F::rand_vec(deg); let coeffs = F::rand_vec(deg); let coeffs = PolynomialCoeffs { coeffs }; @@ -101,7 +109,13 @@ mod tests { fn interpolant_random_roots_of_unity() { type F = GoldilocksField; - for deg_log in 0..4 { + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + + for deg_log in 1..4 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, deg_log); + let deg = 1 << deg_log; let domain = F::two_adic_subgroup(deg_log); let coeffs = F::rand_vec(deg); @@ -116,8 +130,15 @@ mod tests { fn interpolant_random_overspecified() { type F = GoldilocksField; + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + for deg in 0..10 { let points = deg + 5; + + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log2_ceil(points)); + let domain = F::rand_vec(points); let coeffs = F::rand_vec(deg); let coeffs = PolynomialCoeffs { coeffs }; @@ -137,6 +158,8 @@ mod tests { let points = [(F::rand(), F::rand()), (F::rand(), F::rand())]; let x = F::rand(); + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, 2); let ev0 = interpolant(&points).eval(x); let ev1 = interpolate(&points, x, &barycentric_weights(&points)); let ev2 = interpolate2(points, x); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index c13bbca27..2a97352c4 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -440,6 +440,8 @@ impl Mul for &PolynomialCoeffs { mod tests { use std::time::Instant; + #[cfg(feature = "cuda")] + use plonky2_util::log2_ceil; use rand::rngs::OsRng; use rand::Rng; @@ -479,6 +481,13 @@ mod tests { let k = 8; let n = 1 << k; + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + zeknox::init_twiddle_factors_rs(0, k); + } + let poly = PolynomialCoeffs::new(F::rand_vec(n)); let shift = F::rand(); let coset_evals = poly.coset_fft(shift).values; @@ -500,6 +509,13 @@ mod tests { let k = 8; let n = 1 << k; + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + zeknox::init_twiddle_factors_rs(0, k); + } + let evals = PolynomialValues::new(F::rand_vec(n)); let shift = F::rand(); let coeffs = evals.clone().coset_ifft(shift); @@ -520,6 +536,12 @@ mod tests { type F = GoldilocksField; let mut rng = OsRng; let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000)); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + zeknox::init_twiddle_factors_rs(0, log2_ceil(a_deg + b_deg + 1)); + } let a = PolynomialCoeffs::new(F::rand_vec(a_deg)); let b = PolynomialCoeffs::new(F::rand_vec(b_deg)); let m1 = &a * &b; @@ -537,11 +559,24 @@ mod tests { let mut rng = OsRng; let a_deg = rng.gen_range(0..1_000); let n = rng.gen_range(1..1_000); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + for i in 1..=log2_ceil(max(a_deg, n)) + 1 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let mut a = PolynomialCoeffs::new(F::rand_vec(a_deg + 1)); + println!("a {} b {}", a.len(), n); + if a.coeffs[0].is_zero() { a.coeffs[0] = F::ONE; // First coefficient needs to be nonzero. } let b = a.inv_mod_xn(n); + println!("a {} b {}", a.len(), b.len()); + let mut m = &a * &b; m.coeffs.truncate(n); m.trim(); @@ -575,6 +610,15 @@ mod tests { type F = GoldilocksField; let mut rng = OsRng; let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000)); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + for i in 1..=log2_ceil(max(a_deg, b_deg)) + 1 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let a = PolynomialCoeffs::new(F::rand_vec(a_deg)); let b = PolynomialCoeffs::new(F::rand_vec(b_deg)); let (q, r) = a.div_rem(&b); @@ -606,6 +650,7 @@ mod tests { let mut rng = OsRng; let l = 14; let n = 1 << l; + let g = F::primitive_root_of_unity(l); let xn_minus_one = { let mut xn_min_one_vec = vec![F::ZERO; n + 1]; @@ -616,6 +661,15 @@ mod tests { let a = g.exp_u64(rng.gen_range(0..(n as u64))); let denom = PolynomialCoeffs::new(vec![-a, F::ONE]); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + for i in 1..=l + 1 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let now = Instant::now(); xn_minus_one.div_rem(&denom); println!("Division time: {:?}", now.elapsed()); diff --git a/field/src/types.rs b/field/src/types.rs index d714b7a84..5a34bb6a3 100644 --- a/field/src/types.rs +++ b/field/src/types.rs @@ -91,6 +91,9 @@ pub trait Field: /// The bit length of the field order. const BITS: usize; + /// Whether this field is supported by cuda + const CUDA_SUPPORT: bool = false; + fn order() -> BigUint; fn characteristic() -> BigUint; From fb3c96f7558ad22c1d9d05083a48aa6150563280 Mon Sep 17 00:00:00 2001 From: zhenfeizhang Date: Tue, 18 Nov 2025 08:10:50 -0500 Subject: [PATCH 03/37] fix FFT/cosetFFT GPUs --- field/src/fft.rs | 475 +++++++++++++++++++++++++++++++++++- field/src/polynomial/mod.rs | 9 + 2 files changed, 483 insertions(+), 1 deletion(-) diff --git a/field/src/fft.rs b/field/src/fft.rs index 85defc48b..eeb86b62d 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -52,6 +52,177 @@ fn fft_dispatch_gpu( } } +/// Batch FFT computation for multiple polynomials on GPU +#[cfg(feature = "cuda")] +fn fft_batch_dispatch_gpu( + inputs: &mut [F], + poly_size: usize, + num_polys: usize, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + if F::CUDA_SUPPORT { + let mut cfg = NTTConfig::default(); + cfg.batches = num_polys as u32; + + return ntt_batch( + 0, + inputs.as_mut_ptr(), + poly_size.trailing_zeros() as usize, + cfg, + ); + } else { + // Fallback to CPU: process each polynomial separately + for i in 0..num_polys { + let start = i * poly_size; + let end = start + poly_size; + fft_dispatch_cpu(&mut inputs[start..end], zero_factor, root_table); + } + } +} + +#[cfg(feature = "cuda")] +pub(crate) fn coset_fft_gpu( + poly: PolynomialCoeffs, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> PolynomialValues { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + if !F::CUDA_SUPPORT { + // Fallback to CPU if CUDA not supported for this field + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + return fft_with_options(modified_poly, zero_factor, root_table); + } + + let PolynomialCoeffs { coeffs: mut buffer } = poly; + let lg_n = buffer.len().trailing_zeros() as usize; + + // // Initialize coset on GPU + // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) + // // TODO: Make this generic for other fields if needed + // let coset_gen_u64 = 7u64; + // init_coset_rs(0, lg_n, coset_gen_u64); + + // Configure NTT for coset + let mut cfg = NTTConfig::default(); + cfg.with_coset = true; + cfg.ntt_type = zeknox::types::NTTType::Coset; + + // Perform coset NTT on GPU + ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg); + + PolynomialValues::new(buffer) +} + +/// Batch coset FFT computation for multiple polynomials on GPU +#[cfg(feature = "cuda")] +fn coset_fft_batch_gpu( + polys: Vec>, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> Vec> { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + if polys.is_empty() { + return Vec::new(); + } + + let num_polys = polys.len(); + let poly_size = polys[0].len(); + + // Verify all polynomials have the same size + assert!( + polys.iter().all(|p| p.len() == poly_size), + "All polynomials must have the same size for batch coset FFT" + ); + + if !F::CUDA_SUPPORT { + // Fallback to CPU if CUDA not supported for this field + return polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + fft_with_options(modified_poly, zero_factor, root_table) + }) + .collect(); + } + + // Flatten all polynomials into a single contiguous buffer + let mut buffer: Vec = Vec::with_capacity(num_polys * poly_size); + for poly in polys { + buffer.extend_from_slice(&poly.coeffs); + } + + let lg_n = poly_size.trailing_zeros() as usize; + + // Configure NTT for batch coset + let mut cfg = NTTConfig::default(); + cfg.batches = num_polys as u32; + cfg.with_coset = true; + cfg.ntt_type = zeknox::types::NTTType::Coset; + + // Perform batch coset NTT on GPU + ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg); + + // Split the buffer back into separate polynomials + buffer + .chunks(poly_size) + .map(|chunk| PolynomialValues::new(chunk.to_vec())) + .collect() +} + +/// Compute coset FFT for multiple polynomials in batch. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +pub fn coset_fft_batch(polys: Vec>) -> Vec> { + coset_fft_batch_with_options(polys, None, None) +} + +/// Compute coset FFT for multiple polynomials in batch with options. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +pub fn coset_fft_batch_with_options( + polys: Vec>, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> Vec> { + #[cfg(feature = "cuda")] + return coset_fft_batch_gpu(polys, zero_factor, root_table); + + #[cfg(not(feature = "cuda"))] + { + // CPU fallback: process each polynomial separately + polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + fft_with_options(modified_poly, zero_factor, root_table) + }) + .collect() + } +} + fn fft_dispatch_cpu( input: &mut [F], zero_factor: Option, @@ -103,6 +274,66 @@ pub fn fft_with_options( PolynomialValues::new(buffer) } +/// Compute FFT for multiple polynomials in batch. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +#[inline] +pub fn fft_batch(polys: Vec>) -> Vec> { + fft_batch_with_options(polys, None, None) +} + +/// Compute FFT for multiple polynomials in batch with options. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +pub fn fft_batch_with_options( + polys: Vec>, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> Vec> { + if polys.is_empty() { + return Vec::new(); + } + + let num_polys = polys.len(); + let poly_size = polys[0].len(); + + // Verify all polynomials have the same size + assert!( + polys.iter().all(|p| p.len() == poly_size), + "All polynomials must have the same size for batch FFT" + ); + assert!( + poly_size.is_power_of_two(), + "Polynomial size must be a power of 2" + ); + + // Flatten all polynomials into a single contiguous buffer + let mut buffer: Vec = Vec::with_capacity(num_polys * poly_size); + for poly in polys { + buffer.extend_from_slice(&poly.coeffs); + } + + // Dispatch to GPU or CPU batch processing + #[cfg(feature = "cuda")] + fft_batch_dispatch_gpu(&mut buffer, poly_size, num_polys, zero_factor, root_table); + + #[cfg(not(feature = "cuda"))] + { + // CPU fallback: process each polynomial separately + for i in 0..num_polys { + let start = i * poly_size; + let end = start + poly_size; + fft_dispatch_cpu(&mut buffer[start..end], zero_factor, root_table); + } + } + + // Split the buffer back into separate polynomials + buffer + .chunks(poly_size) + .map(|chunk| PolynomialValues::new(chunk.to_vec())) + .collect() +} + #[inline] pub fn ifft(poly: PolynomialValues) -> PolynomialCoeffs { ifft_with_options(poly, None, None) @@ -252,7 +483,7 @@ mod tests { #[cfg(feature = "cuda")] use zeknox::init_twiddle_factors_rs; - use crate::fft::{fft, fft_with_options, ifft}; + use crate::fft::{coset_fft_batch, fft, fft_batch, fft_with_options, ifft}; use crate::goldilocks_field::GoldilocksField; use crate::polynomial::{PolynomialCoeffs, PolynomialValues}; use crate::types::Field; @@ -302,6 +533,248 @@ mod tests { } } + #[test] + #[cfg(feature = "cuda")] + fn test_fft_gpu_vs_cpu_single() { + type F = GoldilocksField; + + // Test various polynomial sizes + for log_size in [8, 10, 12, 14] { + let size = 1 << log_size; + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Create a random polynomial + let coeffs: Vec = (0..size) + .map(|i| F::from_canonical_usize(i * 7919 % 1000000)) + .collect(); + + let poly = PolynomialCoeffs { + coeffs: coeffs.clone(), + }; + + // Compute FFT using GPU (via fft function which dispatches to GPU) + let gpu_result = fft(poly.clone()); + + // Compute FFT using CPU (force CPU path) + let mut cpu_buffer = coeffs.clone(); + super::fft_dispatch_cpu(&mut cpu_buffer, None, None); + let cpu_result = PolynomialValues::new(cpu_buffer); + + // Compare results + assert_eq!( + gpu_result.len(), + cpu_result.len(), + "GPU and CPU results have different lengths for size {}", + size + ); + + for i in 0..size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Mismatch at index {} for polynomial size {}", + i, size + ); + } + } + } + + #[test] + #[cfg(feature = "cuda")] + fn test_fft_batch_gpu_vs_cpu() { + type F = GoldilocksField; + + let poly_size: usize = 1 << 10; // 1024 elements + let num_polys = 8; + let log_size = poly_size.trailing_zeros() as usize; + + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Create multiple random polynomials + let polys: Vec> = (0..num_polys) + .map(|batch_idx| { + let coeffs: Vec = (0..poly_size) + .map(|i| F::from_canonical_usize((i * 7919 + batch_idx * 12345) % 1000000)) + .collect(); + PolynomialCoeffs { coeffs } + }) + .collect(); + + // Compute batch FFT using GPU + let gpu_results = fft_batch(polys.clone()); + + // Compute FFT for each polynomial using CPU + let cpu_results: Vec> = polys + .into_iter() + .map(|poly| { + let mut buffer = poly.coeffs.clone(); + super::fft_dispatch_cpu(&mut buffer, None, None); + PolynomialValues::new(buffer) + }) + .collect(); + + // Compare results + assert_eq!(gpu_results.len(), cpu_results.len()); + for (batch_idx, (gpu_result, cpu_result)) in + gpu_results.iter().zip(cpu_results.iter()).enumerate() + { + assert_eq!(gpu_result.len(), cpu_result.len()); + for i in 0..poly_size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Batch FFT mismatch at batch {} index {}", + batch_idx, i + ); + } + } + } + + #[test] + #[cfg(feature = "cuda")] + fn test_coset_fft_gpu_vs_cpu_single() { + use zeknox::init_coset_rs; + + use crate::types::PrimeField64; + type F = GoldilocksField; + + for log_size in [8, 10, 12] { + let size = 1 << log_size; + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Initialize coset for GPU + let coset_gen_u64 = F::coset_shift().to_canonical_u64(); + init_coset_rs(0, log_size, coset_gen_u64); + + // Create a random polynomial + let coeffs: Vec = (0..size) + .map(|i| F::from_canonical_usize(i * 8191 % 1000000)) + .collect(); + + let poly = PolynomialCoeffs { + coeffs: coeffs.clone(), + }; + + // Compute coset FFT using GPU + let gpu_result = super::coset_fft_gpu(poly.clone(), None, None); + + // Compute coset FFT using CPU (apply coset shift then FFT) + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + + let mut cpu_buffer = modified_poly.coeffs; + super::fft_dispatch_cpu(&mut cpu_buffer, None, None); + let cpu_result = PolynomialValues::new(cpu_buffer); + + // Compare results + assert_eq!( + gpu_result.len(), + cpu_result.len(), + "GPU and CPU coset FFT results have different lengths for size {}", + size + ); + + for i in 0..size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Coset FFT mismatch at index {} for polynomial size {}", + i, size + ); + } + } + } + + #[test] + #[cfg(feature = "cuda")] + fn test_coset_fft_batch_gpu_vs_cpu() { + use zeknox::init_coset_rs; + + use crate::types::PrimeField64; + type F = GoldilocksField; + + let poly_size: usize = 1 << 10; // 1024 elements + let num_polys = 8; + let log_size = poly_size.trailing_zeros() as usize; + + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Initialize coset for GPU + let coset_gen_u64 = F::coset_shift().to_canonical_u64(); + init_coset_rs(0, log_size, coset_gen_u64); + + // Create multiple random polynomials + let polys: Vec> = (0..num_polys) + .map(|batch_idx| { + let coeffs: Vec = (0..poly_size) + .map(|i| F::from_canonical_usize((i * 8191 + batch_idx * 54321) % 1000000)) + .collect(); + PolynomialCoeffs { coeffs } + }) + .collect(); + + // Compute batch coset FFT using GPU + let gpu_results = coset_fft_batch(polys.clone()); + + // Compute coset FFT for each polynomial using CPU + let cpu_results: Vec> = polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + + let mut buffer = modified_poly.coeffs; + super::fft_dispatch_cpu(&mut buffer, None, None); + PolynomialValues::new(buffer) + }) + .collect(); + + // Compare results + assert_eq!(gpu_results.len(), cpu_results.len()); + for (batch_idx, (gpu_result, cpu_result)) in + gpu_results.iter().zip(cpu_results.iter()).enumerate() + { + assert_eq!(gpu_result.len(), cpu_result.len()); + for i in 0..poly_size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Batch coset FFT mismatch at batch {} index {}", + batch_idx, i + ); + } + } + } + + #[test] + fn test_batch_fft_empty() { + type F = GoldilocksField; + let polys: Vec> = vec![]; + let results = fft_batch(polys); + assert!(results.is_empty()); + } + + #[test] + #[should_panic(expected = "All polynomials must have the same size")] + fn test_batch_fft_different_sizes() { + type F = GoldilocksField; + let poly1 = PolynomialCoeffs { + coeffs: vec![F::ONE; 256], + }; + let poly2 = PolynomialCoeffs { + coeffs: vec![F::ONE; 512], + }; + let _ = fft_batch(vec![poly1, poly2]); + } + fn evaluate_naive(coefficients: &PolynomialCoeffs) -> PolynomialValues { let degree = coefficients.len(); let degree_padded = 1 << log2_ceil(degree); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index 2a97352c4..be8bf0ad9 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -283,6 +283,15 @@ impl PolynomialCoeffs { zero_factor: Option, root_table: Option<&FftRootTable>, ) -> PolynomialValues { + #[cfg(feature = "cuda")] + { + if F::CUDA_SUPPORT && shift == F::coset_shift() { + // Use GPU coset FFT directly without CPU-side coefficient modification + return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); + } + } + + // CPU path: multiply by powers of shift, then do regular FFT let modified_poly: Self = shift .powers() .zip(&self.coeffs) From c3aae3d9e6de1ab5ba8e2995eb3815e834890dbd Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Thu, 20 Nov 2025 10:48:52 -0500 Subject: [PATCH 04/37] fix merkle tree --- Cargo.toml | 2 + field/Cargo.toml | 4 + field/src/goldilocks_extensions.rs | 25 +- field/src/goldilocks_field.rs | 4 +- plonky2/Cargo.toml | 6 + plonky2/benches/merkle.rs | 2 +- plonky2/examples/fibonacci.rs | 21 +- plonky2/src/batch_fri/oracle.rs | 2 +- plonky2/src/batch_fri/prover.rs | 3 +- plonky2/src/fri/oracle.rs | 4 +- plonky2/src/fri/prover.rs | 3 +- plonky2/src/hash/keccak.rs | 3 +- plonky2/src/hash/merkle_proofs.rs | 8 +- plonky2/src/hash/merkle_tree.rs | 1067 +++++++++++++++++++++++-- plonky2/src/hash/mod.rs | 2 +- plonky2/src/hash/path_compression.rs | 15 +- plonky2/src/hash/poseidon.rs | 3 +- plonky2/src/lib.rs | 2 +- plonky2/src/plonk/config.rs | 10 + plonky2/src/util/serialization/mod.rs | 27 +- 20 files changed, 1083 insertions(+), 130 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 81dbbde49..3bb243ecf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,10 +9,12 @@ hashbrown = { version = "0.14.3", default-features = false, features = ["ahash", itertools = { version = "0.11.0", default-features = false } log = { version = "0.4.14", default-features = false } num = { version = "0.4", default-features = false, features = ["rand"] } +once_cell = { version = "1.18.0", default-features = false } rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } +zeknox = { path = "../zeknox/wrappers/rust" } [profile.release] opt-level = 3 diff --git a/field/Cargo.toml b/field/Cargo.toml index e13f49efd..8ec0fac52 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -29,3 +29,7 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] [lints] workspace = true + +[features] +default = [] +cuda = [] \ No newline at end of file diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs index 8f2d85253..6dd15ce0d 100644 --- a/field/src/goldilocks_extensions.rs +++ b/field/src/goldilocks_extensions.rs @@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/2) const DTH_ROOT: Self = Self(18446744069414584320); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = + [Self(18081566051660590251), Self(16121475356294670766)]; - const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)]; + const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; } impl Mul for QuadraticExtension { @@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/4) const DTH_ROOT: Self = Self(281474976710656); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = - [Self(0), Self(8295451483910296135), Self(0), Self(0)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ + Self(5024755240244648895), + Self(13227474371289740625), + Self(3912887029498544536), + Self(3900057112666848848), + ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = - [Self(0), Self(0), Self(0), Self(17216955519093520442)]; + [Self(0), Self(0), Self(0), Self(12587610116473453104)]; } impl Mul for QuarticExtension { @@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField { const DTH_ROOT: Self = Self(1041288259238279555); const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ - Self(4624713872807171977), - Self(381988216716071028), - Self(14499722700050429911), - Self(4870631734967222356), - Self(4518902370426242880), + Self(2899034827742553394), + Self(13012057356839176729), + Self(14593811582388663055), + Self(7722900811313895436), + Self(4557222484695340057), ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index b0191ca59..4e459c908 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -77,14 +77,14 @@ impl Field for GoldilocksField { const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY; // Sage: `g = GF(p).multiplicative_generator()` - const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720); + const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7); // Sage: // ``` // g_2 = g^((p - 1) / 2^32) // g_2.multiplicative_order().factor() // ``` - const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926); + const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772); const BITS: usize = 64; diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 83ff08519..d9e31168d 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -13,10 +13,12 @@ categories.workspace = true [features] default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", "cuda"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] timing = ["std", "dep:web-time"] +cuda = ["plonky2_field/cuda"] [dependencies] ahash = { workspace = true } @@ -26,6 +28,7 @@ itertools = { workspace = true } keccak-hash = { version = "0.8.0", default-features = false } log = { workspace = true } num = { workspace = true } +once_cell = { workspace = true } rand = { workspace = true } rand_chacha = { version = "0.3.1", optional = true, default-features = false } serde = { workspace = true, features = ["rc"] } @@ -38,6 +41,9 @@ plonky2_field = { version = "1.0.0", path = "../field", default-features = false plonky2_maybe_rayon = { version = "1.0.0", path = "../maybe_rayon", default-features = false } plonky2_util = { version = "1.0.0", path = "../util", default-features = false } +# cuda accelerator wrapper +zeknox = { workspace = true } + [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies] getrandom = { version = "0.2", default-features = false, features = ["js"] } diff --git a/plonky2/benches/merkle.rs b/plonky2/benches/merkle.rs index 6230c1343..e9995be1a 100644 --- a/plonky2/benches/merkle.rs +++ b/plonky2/benches/merkle.rs @@ -23,7 +23,7 @@ pub(crate) fn bench_merkle_tree>(c: &mut Criterion) { let size = 1 << size_log; group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, _| { let leaves = vec![F::rand_vec(ELEMS_PER_LEAF); size]; - b.iter(|| MerkleTree::::new(leaves.clone(), 0)); + b.iter(|| MerkleTree::::new_from_2d(leaves.clone(), 0)); }); } } diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index 578dc2424..79101dd3b 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -1,14 +1,21 @@ -use anyhow::Result; +use anyhow::{Ok, Result}; +use log::Level; use plonky2::field::types::Field; use plonky2::iop::witness::{PartialWitness, WitnessWrite}; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::circuit_data::CircuitConfig; use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; +use plonky2::util::timing::TimingTree; /// An example of using Plonky2 to prove a statement of the form /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b." /// When a == 0 and b == 1, this is proving knowledge of the 100th (standard) Fibonacci number. fn main() -> Result<()> { + env_logger::Builder::from_default_env() + .format_timestamp(None) + .filter_level(log::LevelFilter::Debug) + .init(); + const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; @@ -21,7 +28,7 @@ fn main() -> Result<()> { let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..99 { + for _ in 0..999999 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; @@ -38,12 +45,16 @@ fn main() -> Result<()> { pw.set_target(initial_b, F::ONE)?; let data = builder.build::(); - let proof = data.prove(pw)?; + let mut timing = TimingTree::new("prove", Level::Info); + println!("Starting proof generation..."); + let proof = plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?; println!( "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] ); - - data.verify(proof) + timing.print(); + data.verify(proof)?; + println!("Proof verified!"); + Ok(()) } diff --git a/plonky2/src/batch_fri/oracle.rs b/plonky2/src/batch_fri/oracle.rs index 58deeaa3c..bdf6da72a 100644 --- a/plonky2/src/batch_fri/oracle.rs +++ b/plonky2/src/batch_fri/oracle.rs @@ -15,7 +15,7 @@ use crate::fri::oracle::PolynomialBatch; use crate::fri::proof::FriProof; use crate::fri::structure::{FriBatchInfo, FriInstanceInfo}; use crate::fri::FriParams; -use crate::hash::batch_merkle_tree::BatchMerkleTree; +// use crate::hash::batch_merkle_tree::BatchMerkleTree; use crate::hash::hash_types::RichField; use crate::iop::challenger::Challenger; use crate::plonk::config::GenericConfig; diff --git a/plonky2/src/batch_fri/prover.rs b/plonky2/src/batch_fri/prover.rs index e71fe25b4..bed056c0b 100644 --- a/plonky2/src/batch_fri/prover.rs +++ b/plonky2/src/batch_fri/prover.rs @@ -104,7 +104,8 @@ pub(crate) fn batch_fri_committed_trees< reverse_index_bits_in_place(&mut final_values.values); let chunked_values = final_values.values.par_chunks(arity).map(flatten).collect(); - let tree = MerkleTree::::new(chunked_values, fri_params.config.cap_height); + let tree = + MerkleTree::::new_from_2d(chunked_values, fri_params.config.cap_height); challenger.observe_cap(&tree.cap); trees.push(tree); diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index e413071a4..bf986fe64 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -99,7 +99,7 @@ impl, C: GenericConfig, const D: usize> let merkle_tree = timed!( timing, "build Merkle tree", - MerkleTree::new(leaves, cap_height) + MerkleTree::new_from_2d(leaves, cap_height) ); Self { @@ -142,7 +142,7 @@ impl, C: GenericConfig, const D: usize> pub fn get_lde_values(&self, index: usize, step: usize) -> &[F] { let index = index * step; let index = reverse_bits(index, self.degree_log + self.rate_bits); - let slice = &self.merkle_tree.leaves[index]; + let slice = &self.merkle_tree.get(index); &slice[..slice.len() - if self.blinding { SALT_SIZE } else { 0 }] } diff --git a/plonky2/src/fri/prover.rs b/plonky2/src/fri/prover.rs index 24c88ced7..e5792cb2c 100644 --- a/plonky2/src/fri/prover.rs +++ b/plonky2/src/fri/prover.rs @@ -101,7 +101,8 @@ fn fri_committed_trees, C: GenericConfig, .par_chunks(arity) .map(|chunk: &[F::Extension]| flatten(chunk)) .collect(); - let tree = MerkleTree::::new(chunked_values, fri_params.config.cap_height); + let tree = + MerkleTree::::new_from_2d(chunked_values, fri_params.config.cap_height); challenger.observe_cap(&tree.cap); trees.push(tree); diff --git a/plonky2/src/hash/keccak.rs b/plonky2/src/hash/keccak.rs index d3fa8c4b2..61e7cb87e 100644 --- a/plonky2/src/hash/keccak.rs +++ b/plonky2/src/hash/keccak.rs @@ -7,7 +7,7 @@ use keccak_hash::keccak; use crate::hash::hash_types::{BytesHash, RichField}; use crate::hash::hashing::PlonkyPermutation; -use crate::plonk::config::Hasher; +use crate::plonk::config::{Hasher, HasherType}; use crate::util::serialization::Write; pub const SPONGE_RATE: usize = 8; @@ -103,6 +103,7 @@ impl PlonkyPermutation for KeccakPermutation { pub struct KeccakHash; impl Hasher for KeccakHash { const HASH_SIZE: usize = N; + const HASHER_TYPE: HasherType = HasherType::Keccak; type Hash = BytesHash; type Permutation = KeccakPermutation; diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs index 424e03ae6..892564932 100644 --- a/plonky2/src/hash/merkle_proofs.rs +++ b/plonky2/src/hash/merkle_proofs.rs @@ -342,7 +342,8 @@ mod tests { let n = 1 << log_n; let cap_height = 1; let leaves = random_data::(n, 7); - let tree = MerkleTree::>::Hasher>::new(leaves, cap_height); + let tree = + MerkleTree::>::Hasher>::new_from_2d(leaves, cap_height); let i: usize = OsRng.gen_range(0..n); let proof = tree.prove(i); @@ -359,9 +360,10 @@ mod tests { let i_c = builder.constant(F::from_canonical_usize(i)); let i_bits = builder.split_le(i_c, log_n); - let data = builder.add_virtual_targets(tree.leaves[i].len()); + let data = builder.add_virtual_targets(tree.leaf_size); + let leaf = tree.get(i); for j in 0..data.len() { - pw.set_target(data[j], tree.leaves[i][j])?; + pw.set_target(data[j], leaf[j])?; } builder.verify_merkle_proof_to_cap::<>::InnerHasher>( diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 31bcf5e37..b1c5dcc37 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -1,16 +1,51 @@ -#[cfg(not(feature = "std"))] -use alloc::vec::Vec; use core::mem::MaybeUninit; use core::slice; +use std::collections::HashSet; +#[cfg(feature = "cuda")] +use std::sync::Arc; +#[cfg(feature = "cuda")] +use std::sync::Mutex; +use std::time::Instant; +#[cfg(not(feature = "std"))] +use std::vec::Vec; +use num::range; +#[cfg(feature = "cuda")] +use once_cell::sync::Lazy; use plonky2_maybe_rayon::*; use serde::{Deserialize, Serialize}; +#[cfg(feature = "cuda")] +use zeknox::device::memory::HostOrDeviceSlice; +#[cfg(feature = "cuda")] +use zeknox::device::stream::CudaStream; +#[cfg(feature = "cuda")] +use zeknox::fill_digests_buf_linear_gpu_with_gpu_ptr; +#[cfg(feature = "cuda")] +use zeknox::fill_digests_buf_linear_multigpu_with_gpu_ptr; use crate::hash::hash_types::RichField; +#[cfg(feature = "cuda")] +use crate::hash::hash_types::NUM_HASH_OUT_ELTS; use crate::hash::merkle_proofs::MerkleProof; +#[cfg(feature = "cuda")] +use crate::plonk::config::HasherType; use crate::plonk::config::{GenericHashOut, Hasher}; use crate::util::log2_strict; +#[cfg(feature = "cuda")] +pub static GPU_ID: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); + +#[cfg(all(feature = "timing", feature = "cuda"))] +fn print_time(now: Instant, msg: &str) { + println!("Time {} {} ms", msg, now.elapsed().as_millis()); +} + +#[cfg(not(all(feature = "timing", feature = "cuda")))] +fn print_time(_now: Instant, _msg: &str) {} + +#[cfg(feature = "cuda")] +const FORCE_SINGLE_GPU: bool = true; + /// The Merkle cap of height `h` of a Merkle tree is the `h`-th layer (from the root) of the tree. /// It can be used in place of the root to verify Merkle paths, which are `h` elements shorter. #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] @@ -45,7 +80,10 @@ impl> MerkleCap { #[derive(Clone, Debug, Eq, PartialEq)] pub struct MerkleTree> { /// The data in the leaves of the Merkle tree. - pub leaves: Vec>, + // pub leaves: Vec>, + pub leaves: Vec, + + pub leaf_size: usize, /// The digests in the tree. Consists of `cap.len()` sub-trees, each corresponding to one /// element in `cap`. Each subtree is contiguous and located at @@ -64,6 +102,7 @@ pub struct MerkleTree> { impl> Default for MerkleTree { fn default() -> Self { Self { + leaf_size: 0, leaves: Vec::new(), digests: Vec::new(), cap: MerkleCap::default(), @@ -71,7 +110,7 @@ impl> Default for MerkleTree { } } -pub(crate) fn capacity_up_to_mut(v: &mut Vec, len: usize) -> &mut [MaybeUninit] { +fn capacity_up_to_mut(v: &mut Vec, len: usize) -> &mut [MaybeUninit] { assert!(v.capacity() >= len); let v_ptr = v.as_mut_ptr().cast::>(); unsafe { @@ -83,59 +122,105 @@ pub(crate) fn capacity_up_to_mut(v: &mut Vec, len: usize) -> &mut [MaybeUn } } -pub(crate) fn fill_subtree>( +fn fill_subtree>( digests_buf: &mut [MaybeUninit], - leaves: &[Vec], + leaves: &[F], + leaf_size: usize, ) -> H::Hash { - assert_eq!(leaves.len(), digests_buf.len() / 2 + 1); - if digests_buf.is_empty() { - H::hash_or_noop(&leaves[0]) - } else { - // Layout is: left recursive output || left child digest - // || right child digest || right recursive output. - // Split `digests_buf` into the two recursive outputs (slices) and two child digests - // (references). - let (left_digests_buf, right_digests_buf) = digests_buf.split_at_mut(digests_buf.len() / 2); - let (left_digest_mem, left_digests_buf) = left_digests_buf.split_last_mut().unwrap(); - let (right_digest_mem, right_digests_buf) = right_digests_buf.split_first_mut().unwrap(); - // Split `leaves` between both children. - let (left_leaves, right_leaves) = leaves.split_at(leaves.len() / 2); - - let (left_digest, right_digest) = plonky2_maybe_rayon::join( - || fill_subtree::(left_digests_buf, left_leaves), - || fill_subtree::(right_digests_buf, right_leaves), - ); + let leaves_count = leaves.len() / leaf_size; - left_digest_mem.write(left_digest); - right_digest_mem.write(right_digest); - H::two_to_one(left_digest, right_digest) + // if one leaf => return it hash + if leaves_count == 1 { + let hash = H::hash_or_noop(leaves); + digests_buf[0].write(hash); + return hash; + } + // if two leaves => return their concat hash + if leaves_count == 2 { + let (leaf1, leaf2) = leaves.split_at(leaf_size); + let hash_left = H::hash_or_noop(leaf1); + let hash_right = H::hash_or_noop(leaf2); + digests_buf[0].write(hash_left); + digests_buf[1].write(hash_right); + return H::two_to_one(hash_left, hash_right); } + + assert_eq!(leaves_count, digests_buf.len() / 2 + 1); + + // leaves first - we can do all in parallel + let (_, digests_leaves) = digests_buf.split_at_mut(digests_buf.len() - leaves_count); + digests_leaves + .into_par_iter() + .enumerate() + .for_each(|(leaf_idx, digest)| { + let (_, r) = leaves.split_at(leaf_idx * leaf_size); + let (leaf, _) = r.split_at(leaf_size); + digest.write(H::hash_or_noop(leaf)); + }); + + // internal nodes - we can do in parallel per level + let mut last_index = digests_buf.len() - leaves_count; + + for level_log in range(1, log2_strict(leaves_count)).rev() { + let level_size = 1 << level_log; + let (_, digests_slice) = digests_buf.split_at_mut(last_index - level_size); + let (digests_slice, next_digests) = digests_slice.split_at_mut(level_size); + + digests_slice + .into_par_iter() + .zip(last_index - level_size..last_index) + .for_each(|(digest, idx)| { + let left_idx = 2 * (idx + 1) - last_index; + let right_idx = left_idx + 1; + + unsafe { + let left_digest = next_digests[left_idx].assume_init(); + let right_digest = next_digests[right_idx].assume_init(); + digest.write(H::two_to_one(left_digest, right_digest)); + } + }); + last_index -= level_size; + } + + // return cap hash + let hash: >::Hash; + unsafe { + let left_digest = digests_buf[0].assume_init(); + let right_digest = digests_buf[1].assume_init(); + hash = H::two_to_one(left_digest, right_digest); + } + hash } -pub(crate) fn fill_digests_buf>( +fn fill_digests_buf>( digests_buf: &mut [MaybeUninit], cap_buf: &mut [MaybeUninit], - leaves: &[Vec], + leaves: &Vec, + leaf_size: usize, cap_height: usize, ) { // Special case of a tree that's all cap. The usual case will panic because we'll try to split // an empty slice into chunks of `0`. (We would not need this if there was a way to split into // `blah` chunks as opposed to chunks _of_ `blah`.) + let leaves_count = leaves.len() / leaf_size; + if digests_buf.is_empty() { - debug_assert_eq!(cap_buf.len(), leaves.len()); + debug_assert_eq!(cap_buf.len(), leaves_count); cap_buf .par_iter_mut() - .zip(leaves) - .for_each(|(cap_buf, leaf)| { + .enumerate() + .for_each(|(leaf_idx, cap_buf)| { + let (_, r) = leaves.split_at(leaf_idx * leaf_size); + let (leaf, _) = r.split_at(leaf_size); cap_buf.write(H::hash_or_noop(leaf)); }); return; } let subtree_digests_len = digests_buf.len() >> cap_height; - let subtree_leaves_len = leaves.len() >> cap_height; + let subtree_leaves_len = leaves_count >> cap_height; let digests_chunks = digests_buf.par_chunks_exact_mut(subtree_digests_len); - let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len); + let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len * leaf_size); assert_eq!(digests_chunks.len(), cap_buf.len()); assert_eq!(digests_chunks.len(), leaves_chunks.len()); digests_chunks.zip(cap_buf).zip(leaves_chunks).for_each( @@ -143,55 +228,245 @@ pub(crate) fn fill_digests_buf>( // We have `1 << cap_height` sub-trees, one for each entry in `cap`. They are totally // independent, so we schedule one task for each. `digests_buf` and `leaves` are split // into `1 << cap_height` slices, one for each sub-tree. - subtree_cap.write(fill_subtree::(subtree_digests, subtree_leaves)); + subtree_cap.write(fill_subtree::( + subtree_digests, + subtree_leaves, + leaf_size, + )); }, ); + + // TODO - debug code - to remove in future + /* + let digests_count: u64 = digests_buf.len().try_into().unwrap(); + let leaves_count: u64 = leaves.len().try_into().unwrap(); + let cap_height: u64 = cap_height.try_into().unwrap(); + let leaf_size: u64 = leaves[0].len().try_into().unwrap(); + let fname = format!("cpu-{}-{}-{}-{}.txt", digests_count, leaves_count, leaf_size, cap_height); + let mut file = File::create(fname).unwrap(); + for digest in digests_buf { + unsafe { + let hash = digest.assume_init().to_vec(); + for x in hash { + let str = format!("{} ", x.to_canonical_u64()); + file.write_all(str.as_bytes()); + } + } + file.write_all(b"\n"); + } + */ +} + +#[cfg(feature = "cuda")] +#[repr(C)] +union U8U64 { + f1: [u8; 32], + f2: [u64; 4], +} + +#[cfg(feature = "cuda")] +fn fill_digests_buf_gpu>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + let leaves_count = leaves.len() / leaf_size; + + let num_gpus: usize = std::env::var("NUM_OF_GPUS") + .expect("NUM_OF_GPUS should be set") + .parse() + .unwrap(); + + let mut gpu_id_lock = GPU_ID.lock().unwrap(); + let gpu_id = *gpu_id_lock; + *gpu_id_lock += 1; + if *gpu_id_lock >= num_gpus as u64 { + *gpu_id_lock = 0; + } + + let now = Instant::now(); + let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap(); + print_time(now, "alloc gpu leaves buffer"); + + let now = Instant::now(); + let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice()); + print_time(now, "leaves copy to gpu"); + + let now = Instant::now(); + fill_digests_buf_gpu_ptr::( + digests_buf, + cap_buf, + gpu_leaves_buf.as_mut_ptr(), + leaves_count, + leaf_size, + cap_height, + gpu_id, + ); + print_time(now, "fill_digests_buf_gpu_ptr"); } -pub(crate) fn merkle_tree_prove>( - leaf_index: usize, +#[cfg(feature = "cuda")] +fn fill_digests_buf_gpu_ptr>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves_ptr: *const F, leaves_len: usize, + leaf_len: usize, cap_height: usize, - digests: &[H::Hash], -) -> Vec { - let num_layers = log2_strict(leaves_len) - cap_height; - debug_assert_eq!(leaf_index >> (cap_height + num_layers), 0); - - let digest_len = 2 * (leaves_len - (1 << cap_height)); - assert_eq!(digest_len, digests.len()); - - let digest_tree: &[H::Hash] = { - let tree_index = leaf_index >> num_layers; - let tree_len = digest_len >> cap_height; - &digests[tree_len * tree_index..tree_len * (tree_index + 1)] + gpu_id: u64, +) { + let digests_count: u64 = digests_buf.len().try_into().unwrap(); + let leaves_count: u64 = leaves_len.try_into().unwrap(); + let caps_count: u64 = cap_buf.len().try_into().unwrap(); + let cap_height: u64 = cap_height.try_into().unwrap(); + let leaf_size: u64 = leaf_len.try_into().unwrap(); + + let now = Instant::now(); + // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors + let digests_size = if digests_buf.len() == 0 { + NUM_HASH_OUT_ELTS + } else { + digests_buf.len() * NUM_HASH_OUT_ELTS + }; + let caps_size = if cap_buf.len() == 0 { + NUM_HASH_OUT_ELTS + } else { + cap_buf.len() * NUM_HASH_OUT_ELTS }; - // Mask out high bits to get the index within the sub-tree. - let mut pair_index = leaf_index & ((1 << num_layers) - 1); - (0..num_layers) - .map(|i| { - let parity = pair_index & 1; - pair_index >>= 1; - - // The layers' data is interleaved as follows: - // [layer 0, layer 1, layer 0, layer 2, layer 0, layer 1, layer 0, layer 3, ...]. - // Each of the above is a pair of siblings. - // `pair_index` is the index of the pair within layer `i`. - // The index of that the pair within `digests` is - // `pair_index * 2 ** (i + 1) + (2 ** i - 1)`. - let siblings_index = (pair_index << (i + 1)) + (1 << i) - 1; - // We have an index for the _pair_, but we want the index of the _sibling_. - // Double the pair index to get the index of the left sibling. Conditionally add `1` - // if we are to retrieve the right sibling. - let sibling_index = 2 * siblings_index + (1 - parity); - digest_tree[sibling_index] - }) - .collect() + let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap(); + let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap(); + + unsafe { + let num_gpus: usize = std::env::var("NUM_OF_GPUS") + .expect("NUM_OF_GPUS should be set") + .parse() + .unwrap(); + if !FORCE_SINGLE_GPU + && leaves_count >= (1 << 12) + && cap_height > 0 + && num_gpus > 1 + && H::HASHER_TYPE == HasherType::PoseidonBN128 + { + // println!("Multi GPU"); + fill_digests_buf_linear_multigpu_with_gpu_ptr( + gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + leaves_ptr as *mut core::ffi::c_void, + digests_count, + caps_count, + leaves_count, + leaf_size, + cap_height, + H::HASHER_TYPE as u64, + ); + } else { + // println!("Single GPU"); + fill_digests_buf_linear_gpu_with_gpu_ptr( + gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + leaves_ptr as *mut core::ffi::c_void, + digests_count, + caps_count, + leaves_count, + leaf_size, + cap_height, + H::HASHER_TYPE as u64, + gpu_id, + ); + } + } + print_time(now, "fill init"); + + let mut host_digests: Vec = vec![F::ZERO; digests_size]; + let mut host_caps: Vec = vec![F::ZERO; caps_size]; + let stream1 = CudaStream::create().unwrap(); + let stream2 = CudaStream::create().unwrap(); + + gpu_digests_buf + .copy_to_host_async(host_digests.as_mut_slice(), &stream1) + .expect("copy digests"); + gpu_cap_buf + .copy_to_host_async(host_caps.as_mut_slice(), &stream2) + .expect("copy caps"); + stream1.synchronize().expect("cuda sync"); + stream2.synchronize().expect("cuda sync"); + stream1.destroy().expect("cuda stream destroy"); + stream2.destroy().expect("cuda stream destroy"); + + let now = Instant::now(); + + if digests_buf.len() > 0 { + host_digests + .chunks_exact(4) + .zip(digests_buf) + .for_each(|(x, y)| { + unsafe { + let mut parts = U8U64 { f1: [0; 32] }; + parts.f2[0] = x[0].to_canonical_u64(); + parts.f2[1] = x[1].to_canonical_u64(); + parts.f2[2] = x[2].to_canonical_u64(); + parts.f2[3] = x[3].to_canonical_u64(); + let (slice, _) = parts.f1.split_at(H::HASH_SIZE); + let h: H::Hash = H::Hash::from_bytes(slice); + y.write(h); + }; + }); + } + + if cap_buf.len() > 0 { + host_caps.chunks_exact(4).zip(cap_buf).for_each(|(x, y)| { + unsafe { + let mut parts = U8U64 { f1: [0; 32] }; + parts.f2[0] = x[0].to_canonical_u64(); + parts.f2[1] = x[1].to_canonical_u64(); + parts.f2[2] = x[2].to_canonical_u64(); + parts.f2[3] = x[3].to_canonical_u64(); + let (slice, _) = parts.f1.split_at(H::HASH_SIZE); + let h: H::Hash = H::Hash::from_bytes(slice); + y.write(h); + }; + }); + } + print_time(now, "copy results"); +} + +#[cfg(feature = "cuda")] +fn fill_digests_buf_meta>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + // if the input is small or if it Keccak hashing, just do the hashing on CPU + if leaf_size <= H::HASH_SIZE / 8 { + fill_digests_buf::(digests_buf, cap_buf, leaves, leaf_size, cap_height); + } else { + fill_digests_buf_gpu::(digests_buf, cap_buf, leaves, leaf_size, cap_height); + } +} + +#[cfg(not(feature = "cuda"))] +fn fill_digests_buf_meta>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + fill_digests_buf::(digests_buf, cap_buf, leaves, leaf_size, cap_height); } impl> MerkleTree { - pub fn new(leaves: Vec>, cap_height: usize) -> Self { - let log2_leaves_len = log2_strict(leaves.len()); + pub fn new_from_1d(leaves_1d: Vec, leaf_size: usize, cap_height: usize) -> Self { + let leaves_len = leaves_1d.len() / leaf_size; + let log2_leaves_len = log2_strict(leaves_len); assert!( cap_height <= log2_leaves_len, "cap_height={} should be at most log2(leaves.len())={}", @@ -199,7 +474,7 @@ impl> MerkleTree { log2_leaves_len ); - let num_digests = 2 * (leaves.len() - (1 << cap_height)); + let num_digests = 2 * (leaves_len - (1 << cap_height)); let mut digests = Vec::with_capacity(num_digests); let len_cap = 1 << cap_height; @@ -207,7 +482,9 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); - fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); + let now = Instant::now(); + fill_digests_buf_meta::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); + print_time(now, "fill digests buffer"); unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to @@ -215,38 +492,363 @@ impl> MerkleTree { digests.set_len(num_digests); cap.set_len(len_cap); } + /* + println!{"Digest Buffer"}; + for dg in &digests { + println!("{:?}", dg); + } + println!{"Cap Buffer"}; + for dg in &cap { + println!("{:?}", dg); + } + */ + Self { + leaves: leaves_1d, + leaf_size, + digests, + cap: MerkleCap(cap), + } + } + + pub fn new_from_2d(leaves_2d: Vec>, cap_height: usize) -> Self { + let leaf_size = leaves_2d[0].len(); + let leaves_count = leaves_2d.len(); + let zeros = vec![F::from_canonical_u64(0); leaf_size]; + let mut leaves_1d: Vec = Vec::with_capacity(leaves_count * leaf_size); + for idx in 0..leaves_count { + if leaves_2d[idx].len() == 0 { + leaves_1d.extend(zeros.clone()); + } else { + leaves_1d.extend(leaves_2d[idx].clone()); + } + } + Self::new_from_1d(leaves_1d, leaf_size, cap_height) + } + pub fn new_from_fields( + leaves_1d: Vec, + leaf_size: usize, + digests: Vec, + cap: MerkleCap, + ) -> Self { Self { - leaves, + leaves: leaves_1d, + leaf_size, + digests, + cap, + } + } + + #[cfg(feature = "cuda")] + pub fn new_from_gpu_leaves( + leaves_gpu_ptr: &HostOrDeviceSlice<'_, F>, + leaves_len: usize, + leaf_len: usize, + cap_height: usize, + ) -> Self { + let log2_leaves_len = log2_strict(leaves_len); + assert!( + cap_height <= log2_leaves_len, + "cap_height={} should be at most log2(leaves.len())={}", + cap_height, + log2_leaves_len + ); + + // copy data from GPU in async mode + let mut host_leaves: Vec = vec![F::ZERO; leaves_len * leaf_len]; + let stream_copy = CudaStream::create().unwrap(); + + let start = std::time::Instant::now(); + leaves_gpu_ptr + .copy_to_host_async(host_leaves.as_mut_slice(), &stream_copy) + .expect("copy to host error"); + print_time(start, "copy leaves from GPU async"); + + let num_digests = 2 * (leaves_len - (1 << cap_height)); + let mut digests = Vec::with_capacity(num_digests); + + let len_cap = 1 << cap_height; + let mut cap = Vec::with_capacity(len_cap); + + let digests_buf = capacity_up_to_mut(&mut digests, num_digests); + let cap_buf = capacity_up_to_mut(&mut cap, len_cap); + let now = Instant::now(); + let gpu_id = 0; + fill_digests_buf_gpu_ptr::( + digests_buf, + cap_buf, + leaves_gpu_ptr.as_ptr(), + leaves_len, + leaf_len, + cap_height, + gpu_id, + ); + print_time(now, "fill digests buffer"); + + unsafe { + // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to + // `num_digests` and `len_cap`, resp. + digests.set_len(num_digests); + cap.set_len(len_cap); + } + /* + println!{"Digest Buffer"}; + for dg in &digests { + println!("{:?}", dg); + } + println!{"Cap Buffer"}; + for dg in &cap { + println!("{:?}", dg); + } + */ + let _ = stream_copy.synchronize(); + let _ = stream_copy.destroy(); + + Self { + leaves: host_leaves, + leaf_size: leaf_len, digests, cap: MerkleCap(cap), } } pub fn get(&self, i: usize) -> &[F] { - &self.leaves[i] + let (_, v) = self.leaves.split_at(i * self.leaf_size); + let (v, _) = v.split_at(self.leaf_size); + v + } + + pub fn get_leaves_1d(&self) -> Vec { + self.leaves.clone() + } + + pub fn get_leaves_2d(&self) -> Vec> { + let v2d: Vec> = self + .leaves + .chunks_exact(self.leaf_size) + .map(|leaf| leaf.to_vec()) + .collect(); + v2d + } + + pub fn get_leaves_count(&self) -> usize { + self.leaves.len() / self.leaf_size + } + + pub fn change_leaf_and_update(&mut self, leaf: Vec, leaf_index: usize) { + assert_eq!(leaf.len(), self.leaf_size); + let leaves_count = self.leaves.len() / self.leaf_size; + assert!(leaf_index < leaves_count); + + let cap_height = log2_strict(self.cap.len()); + let mut leaves = self.leaves.clone(); + let start = leaf_index * self.leaf_size; + let leaf_copy = leaf.clone(); + leaf.into_iter() + .enumerate() + .for_each(|(i, el)| leaves[start + i] = el); + + let digests_len = self.digests.len(); + let cap_len = self.cap.0.len(); + let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len); + let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len); + self.leaves = leaves; + if digests_buf.is_empty() { + cap_buf[leaf_index].write(H::hash_or_noop(leaf_copy.as_slice())); + } else { + let subtree_leaves_len = leaves_count >> cap_height; + let subtree_idx = leaf_index / subtree_leaves_len; + let subtree_digests_len = digests_buf.len() >> cap_height; + let subtree_offset = subtree_idx * subtree_digests_len; + let idx_in_subtree = + subtree_digests_len - subtree_leaves_len + leaf_index % subtree_leaves_len; + if subtree_leaves_len == 2 { + digests_buf[subtree_offset + idx_in_subtree] + .write(H::hash_or_noop(leaf_copy.as_slice())); + } else { + assert!(subtree_leaves_len > 2); + let idx = subtree_offset + idx_in_subtree; + digests_buf[idx].write(H::hash_or_noop(leaf_copy.as_slice())); + let mut child_idx: i64 = idx_in_subtree as i64; + let mut parent_idx: i64 = child_idx / 2 - 1; + while child_idx > 1 { + unsafe { + let mut left_idx = subtree_offset + child_idx as usize; + let mut right_idx = subtree_offset + child_idx as usize + 1; + if child_idx & 1 == 1 { + left_idx = subtree_offset + child_idx as usize - 1; + right_idx = subtree_offset + child_idx as usize; + } + let left_digest = digests_buf[left_idx].assume_init(); + let right_digest = digests_buf[right_idx].assume_init(); + digests_buf[subtree_offset + parent_idx as usize] + .write(H::two_to_one(left_digest, right_digest)); + } + child_idx = parent_idx; + parent_idx = child_idx / 2 - 1; + } + } + unsafe { + let left_digest = digests_buf[subtree_offset].assume_init(); + let right_digest = digests_buf[subtree_offset + 1].assume_init(); + cap_buf[subtree_idx].write(H::two_to_one(left_digest, right_digest)); + } + } + } + + pub fn change_leaves_in_range_and_update( + &mut self, + new_leaves: Vec>, + start_index: usize, + end_index: usize, + ) { + assert_eq!(new_leaves.len(), end_index - start_index); + assert_eq!(new_leaves[0].len(), self.leaf_size); + + let tree_leaves_count = self.leaves.len() / self.leaf_size; + assert!(start_index < end_index); + assert!(end_index < tree_leaves_count); + + let cap_height = log2_strict(self.cap.len()); + let mut leaves = self.leaves.clone(); + + leaves[start_index * self.leaf_size..end_index * self.leaf_size] + .chunks_exact_mut(self.leaf_size) + .zip(new_leaves.clone()) + .for_each(|(x, y)| { + for j in 0..self.leaf_size { + x[j] = y[j]; + } + }); + + let digests_len = self.digests.len(); + let cap_len = self.cap.0.len(); + let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len); + let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len); + self.leaves = leaves; + if digests_buf.is_empty() { + cap_buf[start_index..end_index] + .par_iter_mut() + .zip(new_leaves) + .for_each(|(cap, leaf)| { + cap.write(H::hash_or_noop(leaf.as_slice())); + }); + } else { + let subtree_leaves_len = tree_leaves_count >> cap_height; + let subtree_digests_len = digests_buf.len() >> cap_height; + + let mut positions: Vec = (start_index..end_index) + .map(|idx| { + let subtree_idx = idx / subtree_leaves_len; + let subtree_offset = subtree_idx * subtree_digests_len; + let idx_in_subtree = + subtree_digests_len - subtree_leaves_len + idx % subtree_leaves_len; + subtree_offset + idx_in_subtree + }) + .collect(); + + // TODO change to parallel loop + for i in 0..positions.len() { + digests_buf[positions[i]].write(H::hash_or_noop(new_leaves[i].as_slice())); + } + + if subtree_digests_len > 2 { + let rounds = log2_strict(tree_leaves_count) - cap_height - 1; + for _ in 0..rounds { + let mut parent_indexes: HashSet = HashSet::new(); + let parents: Vec = positions + .par_iter() + .map(|pos| { + let subtree_offset = pos / subtree_digests_len; + let idx_in_subtree = pos % subtree_digests_len; + let mut parent_idx = 0; + if idx_in_subtree > 1 { + parent_idx = idx_in_subtree / 2 - 1; + } + subtree_offset * subtree_digests_len + parent_idx + }) + .collect(); + for p in parents { + parent_indexes.insert(p); + } + positions = parent_indexes.into_iter().collect(); + + // TODO change to parallel loop + for i in 0..positions.len() { + let subtree_offset = positions[i] / subtree_digests_len; + let idx_in_subtree = positions[i] % subtree_digests_len; + let digest_idx = + subtree_offset * subtree_digests_len + 2 * (idx_in_subtree + 1); + unsafe { + let left_digest = digests_buf[digest_idx].assume_init(); + let right_digest = digests_buf[digest_idx + 1].assume_init(); + digests_buf[positions[i]] + .write(H::two_to_one(left_digest, right_digest)); + } + } + } + } + + let mut cap_indexes: HashSet = HashSet::new(); + for idx in start_index..end_index { + cap_indexes.insert(idx / subtree_leaves_len); + } + + unsafe { + for idx in cap_indexes { + let digest_idx = idx * subtree_digests_len; + let left_digest = digests_buf[digest_idx].assume_init(); + let right_digest = digests_buf[digest_idx + 1].assume_init(); + cap_buf[idx].write(H::two_to_one(left_digest, right_digest)); + } + } + } } /// Create a Merkle proof from a leaf index. pub fn prove(&self, leaf_index: usize) -> MerkleProof { let cap_height = log2_strict(self.cap.len()); - let siblings = - merkle_tree_prove::(leaf_index, self.leaves.len(), cap_height, &self.digests); + let num_layers = log2_strict(self.get_leaves_count()) - cap_height; + let subtree_digest_size = (1 << (num_layers + 1)) - 2; // 2 ^ (k+1) - 2 + let subtree_idx = leaf_index / (1 << num_layers); + + let siblings: Vec<>::Hash> = Vec::with_capacity(num_layers); + if num_layers == 0 { + return MerkleProof { siblings }; + } + + // digests index where we start + let idx = subtree_digest_size - (1 << num_layers) + (leaf_index % (1 << num_layers)); + + let siblings = (0..num_layers) + .map(|i| { + // relative index + let rel_idx = (idx + 2 - (1 << i + 1)) / (1 << i); + // absolute index + let mut abs_idx = subtree_idx * subtree_digest_size + rel_idx; + if (rel_idx & 1) == 1 { + abs_idx -= 1; + } else { + abs_idx += 1; + } + self.digests[abs_idx] + }) + .collect(); MerkleProof { siblings } } } #[cfg(test)] -pub(crate) mod tests { +mod tests { use anyhow::Result; use super::*; use crate::field::extension::Extendable; use crate::hash::merkle_proofs::verify_merkle_proof_to_cap; - use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; + use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig}; - pub(crate) fn random_data(n: usize, k: usize) -> Vec> { + fn random_data(n: usize, k: usize) -> Vec> { (0..n).map(|_| F::rand_vec(k)).collect() } @@ -258,7 +860,7 @@ pub(crate) mod tests { leaves: Vec>, cap_height: usize, ) -> Result<()> { - let tree = MerkleTree::::new(leaves.clone(), cap_height); + let tree = MerkleTree::::new_from_2d(leaves.clone(), cap_height); for (i, leaf) in leaves.into_iter().enumerate() { let proof = tree.prove(i); verify_merkle_proof_to_cap(leaf, i, &tree.cap, &proof)?; @@ -266,6 +868,224 @@ pub(crate) mod tests { Ok(()) } + fn verify_change_leaf_and_update(log_n: usize, cap_h: usize) { + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let n = 1 << log_n; + let k = 7; + let mut leaves = random_data::(n, k); + + let mut mt1 = + MerkleTree::>::Hasher>::new_from_2d(leaves.clone(), cap_h); + + let tmp = random_data::(1, k); + leaves[0] = tmp[0].clone(); + let mt2 = MerkleTree::>::Hasher>::new_from_2d(leaves, cap_h); + + mt1.change_leaf_and_update(tmp[0].clone(), 0); + + /* + println!("Tree 1"); + mt1.digests.into_iter().for_each( + |x| { + println!("{:?}", x); + } + ); + println!("Tree 2"); + mt2.digests.into_iter().for_each( + |x| { + println!("{:?}", x); + } + ); + */ + + mt1.digests + .into_par_iter() + .zip(mt2.digests) + .for_each(|(d1, d2)| { + assert_eq!(d1, d2); + }); + + mt1.cap + .0 + .into_par_iter() + .zip(mt2.cap.0) + .for_each(|(d1, d2)| { + assert_eq!(d1, d2); + }); + } + + fn verify_change_leaf_and_update_range_one_by_one( + leaves_count: usize, + leaf_size: usize, + cap_height: usize, + start_index: usize, + end_index: usize, + ) { + use plonky2_field::types::Field; + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let raw_leaves: Vec> = random_data::(leaves_count, leaf_size); + let vals: Vec> = random_data::(end_index - start_index, leaf_size); + + let mut leaves1_1d: Vec = raw_leaves.into_iter().flatten().collect(); + let leaves2_1d: Vec = leaves1_1d.clone(); + + let mut tree2 = MerkleTree::>::Hasher>::new_from_1d( + leaves2_1d, leaf_size, cap_height, + ); + + // v1 + let now = Instant::now(); + for i in start_index..end_index { + for j in 0..leaf_size { + leaves1_1d[i * leaf_size + j] = vals[i - start_index][j]; + } + } + let tree1 = MerkleTree::>::Hasher>::new_from_1d( + leaves1_1d, leaf_size, cap_height, + ); + println!("Time V1: {} ms", now.elapsed().as_millis()); + + // v2 + let now = Instant::now(); + for idx in start_index..end_index { + let mut leaf: Vec = vec![F::from_canonical_u64(0); leaf_size]; + for j in 0..leaf_size { + leaf[j] = vals[idx - start_index][j]; + } + tree2.change_leaf_and_update(leaf, idx); + } + println!("Time V2: {} ms", now.elapsed().as_millis()); + + // compare leaves + let t2leaves = tree2.get_leaves_1d(); + tree1 + .get_leaves_1d() + .chunks_exact(leaf_size) + .enumerate() + .for_each(|(i, x)| { + let mut ok = true; + for j in 0..leaf_size { + if x[j] != t2leaves[i * leaf_size + j] { + ok = false; + break; + } + } + if !ok { + println!("Leaves different at index {:?}", i); + } + assert!(ok); + }); + + // compare trees + tree1.digests.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.digests[i]; + if x != y { + println!("Digests different at index {:?}", i); + } + assert_eq!(x, y); + }); + tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.cap.0[i]; + if x != y { + println!("Cap different at index {:?}", i); + } + assert_eq!(x, y); + }); + } + + fn verify_change_leaf_and_update_range( + leaves_count: usize, + leaf_size: usize, + cap_height: usize, + start_index: usize, + end_index: usize, + ) { + // use plonky2_field::types::Field; + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let raw_leaves: Vec> = random_data::(leaves_count, leaf_size); + let vals: Vec> = random_data::(end_index - start_index, leaf_size); + + let mut leaves1_1d: Vec = raw_leaves.into_iter().flatten().collect(); + let leaves2_1d: Vec = leaves1_1d.clone(); + + let mut tree2 = MerkleTree::>::Hasher>::new_from_1d( + leaves2_1d, leaf_size, cap_height, + ); + + // v1 + let now = Instant::now(); + for i in start_index..end_index { + for j in 0..leaf_size { + leaves1_1d[i * leaf_size + j] = vals[i - start_index][j]; + } + } + let tree1 = MerkleTree::>::Hasher>::new_from_1d( + leaves1_1d, leaf_size, cap_height, + ); + println!("Time V1: {} ms", now.elapsed().as_millis()); + + // v2 + let now = Instant::now(); + /* + for idx in start_index..end_index { + let mut leaf: Vec = vec![F::from_canonical_u64(0); leaf_size]; + for j in 0..leaf_size { + leaf[j] = vals[idx - start_index][j]; + } + tree2.change_leaf_and_update(leaf, idx); + } + */ + tree2.change_leaves_in_range_and_update(vals, start_index, end_index); + println!("Time V2: {} ms", now.elapsed().as_millis()); + + // compare leaves + let t2leaves = tree2.get_leaves_1d(); + tree1 + .get_leaves_1d() + .chunks_exact(leaf_size) + .enumerate() + .for_each(|(i, x)| { + let mut ok = true; + for j in 0..leaf_size { + if x[j] != t2leaves[i * leaf_size + j] { + ok = false; + break; + } + } + if !ok { + println!("Leaves different at index {:?}", i); + } + assert!(ok); + }); + + // compare trees + tree1.digests.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.digests[i]; + if x != y { + println!("Digests different at index {:?}", i); + } + assert_eq!(x, y); + }); + tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| { + let y = tree2.cap.0[i]; + if x != y { + println!("Cap different at index {:?}", i); + } + assert_eq!(x, y); + }); + } + #[test] #[should_panic] fn test_cap_height_too_big() { @@ -277,7 +1097,7 @@ pub(crate) mod tests { let cap_height = log_n + 1; // Should panic if `cap_height > len_n`. let leaves = random_data::(1 << log_n, 7); - let _ = MerkleTree::>::Hasher>::new(leaves, cap_height); + let _ = MerkleTree::>::Hasher>::new_from_2d(leaves, cap_height); } #[test] @@ -296,12 +1116,89 @@ pub(crate) mod tests { } #[test] - fn test_merkle_trees() -> Result<()> { + fn test_change_leaf_and_update() -> Result<()> { + // small tree, 1 cap + verify_change_leaf_and_update(3, 0); + // small tree, 2 cap + verify_change_leaf_and_update(3, 1); + // small tree, 4 cap + verify_change_leaf_and_update(3, 2); + // small tree, all cap + verify_change_leaf_and_update(3, 3); + + // big tree + verify_change_leaf_and_update(12, 3); + + Ok(()) + } + + #[test] + fn test_change_leaf_and_update_range() -> Result<()> { + for h in 0..11 { + println!( + "Run verify_change_leaf_and_update_range_one_by_one() for height {:?}", + h + ); + verify_change_leaf_and_update_range_one_by_one(1024, 68, h, 32, 48); + println!( + "Run verify_change_leaf_and_update_range() for height {:?}", + h + ); + verify_change_leaf_and_update_range(1024, 68, h, 32, 48); + } + + Ok(()) + } + + #[test] + fn test_merkle_trees_poseidon_g64() -> Result<()> { const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; - let log_n = 8; + // GPU warmup + #[cfg(feature = "cuda")] + let _x: HostOrDeviceSlice<'_, F> = HostOrDeviceSlice::cuda_malloc(0, 64).unwrap(); + + let log_n = 12; + let n = 1 << log_n; + let leaves = random_data::(n, 7); + + verify_all_leaves::(leaves, 1)?; + + Ok(()) + } + + #[cfg(feature = "cuda")] + #[test] + fn test_merkle_trees_cuda_poseidon_g64() -> Result<()> { + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let log_n = 14; + let n = 1 << log_n; + let leaves = random_data::(n, 7); + let leaves_1d: Vec = leaves.into_iter().flatten().collect(); + + let mut gpu_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(0, n * 7).unwrap(); + gpu_data + .copy_from_host(leaves_1d.as_slice()) + .expect("copy data to gpu"); + + MerkleTree::>::Hasher>::new_from_gpu_leaves(&gpu_data, n, 7, 1); + + Ok(()) + } + + #[test] + fn test_merkle_trees_keccak() -> Result<()> { + const D: usize = 2; + type C = KeccakGoldilocksConfig; + type F = >::F; + + let log_n = 12; let n = 1 << log_n; let leaves = random_data::(n, 7); diff --git a/plonky2/src/hash/mod.rs b/plonky2/src/hash/mod.rs index 0e4bb8a59..3d2e3b161 100644 --- a/plonky2/src/hash/mod.rs +++ b/plonky2/src/hash/mod.rs @@ -2,7 +2,7 @@ //! as well as specific hash functions implementation. mod arch; -pub mod batch_merkle_tree; +// pub mod batch_merkle_tree; pub mod hash_types; pub mod hashing; pub mod keccak; diff --git a/plonky2/src/hash/path_compression.rs b/plonky2/src/hash/path_compression.rs index 517576bf0..5a7f7e1ca 100644 --- a/plonky2/src/hash/path_compression.rs +++ b/plonky2/src/hash/path_compression.rs @@ -129,8 +129,14 @@ mod tests { type F = >::F; let h = 10; let cap_height = 3; - let vs = (0..1 << h).map(|_| vec![F::rand()]).collect::>(); - let mt = MerkleTree::>::Hasher>::new(vs.clone(), cap_height); + let vs = (0..1 << h) + .flat_map(|_| vec![F::rand()]) + .collect::>(); + let mt = MerkleTree::>::Hasher>::new_from_1d( + vs.clone(), + 1, + cap_height, + ); let mut rng = OsRng; let k = rng.gen_range(1..=1 << h); @@ -139,7 +145,10 @@ mod tests { let compressed_proofs = compress_merkle_proofs(cap_height, &indices, &proofs); let decompressed_proofs = decompress_merkle_proofs( - &indices.iter().map(|&i| vs[i].clone()).collect::>(), + &indices + .iter() + .map(|&i| vec![vs[i].clone()]) + .collect::>(), &indices, &compressed_proofs, h, diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs index a7c763252..2b6dd0ac2 100644 --- a/plonky2/src/hash/poseidon.rs +++ b/plonky2/src/hash/poseidon.rs @@ -18,7 +18,7 @@ use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation}; use crate::iop::ext_target::ExtensionTarget; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; -use crate::plonk::config::{AlgebraicHasher, Hasher}; +use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType}; pub const SPONGE_RATE: usize = 8; pub const SPONGE_CAPACITY: usize = 4; @@ -874,6 +874,7 @@ impl PlonkyPermutation< pub struct PoseidonHash; impl Hasher for PoseidonHash { const HASH_SIZE: usize = 4 * 8; + const HASHER_TYPE: HasherType = HasherType::Poseidon; type Hash = HashOut; type Permutation = PoseidonPermutation; diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs index 8772ecfc0..4bf8cc982 100644 --- a/plonky2/src/lib.rs +++ b/plonky2/src/lib.rs @@ -11,7 +11,7 @@ pub extern crate alloc; #[doc(inline)] pub use plonky2_field as field; -pub mod batch_fri; +// pub mod batch_fri; pub mod fri; pub mod gadgets; pub mod gates; diff --git a/plonky2/src/plonk/config.rs b/plonky2/src/plonk/config.rs index 217c88976..f4fe480cb 100644 --- a/plonky2/src/plonk/config.rs +++ b/plonky2/src/plonk/config.rs @@ -23,6 +23,14 @@ use crate::hash::poseidon::PoseidonHash; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; +#[derive(PartialEq, Debug, Copy, Clone)] +pub enum HasherType { + Poseidon = 0, + Keccak = 1, + PoseidonBN128 = 2, + Poseidon2 = 3, +} + pub trait GenericHashOut: Copy + Clone + Debug + Eq + PartialEq + Send + Sync + Serialize + DeserializeOwned { @@ -34,6 +42,8 @@ pub trait GenericHashOut: /// Trait for hash functions. pub trait Hasher: Sized + Copy + Debug + Eq + PartialEq { + const HASHER_TYPE: HasherType; + /// Size of `Hash` in bytes. const HASH_SIZE: usize; diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs index 3755851ac..fbfd7974f 100644 --- a/plonky2/src/util/serialization/mod.rs +++ b/plonky2/src/util/serialization/mod.rs @@ -321,21 +321,22 @@ pub trait Read { H: Hasher, { let leaves_len = self.read_usize()?; - let mut leaves = Vec::with_capacity(leaves_len); + let leaf_len = self.read_usize()?; + let mut leaves_2d = Vec::with_capacity(leaves_len * leaf_len); for _ in 0..leaves_len { - let leaf_len = self.read_usize()?; - leaves.push(self.read_field_vec(leaf_len)?); + // let leaf_len = self.read_usize()?; + leaves_2d.push(self.read_field_vec(leaf_len)?); } + let leaves_1d = leaves_2d.into_iter().flatten().collect(); + let digests_len = self.read_usize()?; let digests = self.read_hash_vec::(digests_len)?; let cap_height = self.read_usize()?; let cap = self.read_merkle_cap::(cap_height)?; - Ok(MerkleTree { - leaves, - digests, - cap, - }) + Ok(MerkleTree::new_from_fields( + leaves_1d, leaf_len, digests, cap, + )) } /// Reads a value of type [`OpeningSet`] from `self` with the given `common_data`. @@ -1421,10 +1422,12 @@ pub trait Write { F: RichField, H: Hasher, { - self.write_usize(tree.leaves.len())?; - for i in 0..tree.leaves.len() { - self.write_usize(tree.leaves[i].len())?; - self.write_field_vec(&tree.leaves[i])?; + let leaves_count = tree.get_leaves_count(); + self.write_usize(leaves_count)?; + self.write_usize(tree.leaf_size)?; + for i in 0..leaves_count { + // self.write_usize(tree.leaf_size)?; + self.write_field_vec(&tree.get(i))?; } self.write_hash_vec::(&tree.digests)?; self.write_usize(tree.cap.height())?; From ea7334c71bda6cf19a15f0f6ec48987d7ec7b95a Mon Sep 17 00:00:00 2001 From: zhenfeizhang Date: Sun, 23 Nov 2025 19:22:23 -0500 Subject: [PATCH 05/37] wip --- field/Cargo.toml | 1 + field/src/fft.rs | 190 ++++++++++++++++++++++++++++++------ field/src/polynomial/mod.rs | 14 +-- 3 files changed, 168 insertions(+), 37 deletions(-) diff --git a/field/Cargo.toml b/field/Cargo.toml index 49cb04494..12e38e354 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -11,6 +11,7 @@ keywords.workspace = true categories.workspace = true [dependencies] +ark-std = "0.5.0" anyhow = { workspace = true } itertools = { workspace = true, features = ["use_alloc"] } num = { workspace = true, features = ["alloc"] } diff --git a/field/src/fft.rs b/field/src/fft.rs index eeb86b62d..9c6c8fed6 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -40,16 +40,41 @@ fn fft_dispatch_gpu( ) { use zeknox::ntt_batch; use zeknox::types::NTTConfig; - if F::CUDA_SUPPORT { - return ntt_batch( - 0, - input.as_mut_ptr(), - input.len().trailing_zeros() as usize, - NTTConfig::default(), - ); - } else { - return fft_dispatch_cpu(input, zero_factor, root_table); - } + + let mut a = input.to_vec(); + let mut b = input.to_vec(); + + ntt_batch( + 0, + a.as_mut_ptr(), + input.len().trailing_zeros() as usize, + NTTConfig::default(), + ); + + fft_dispatch_cpu(&mut b, zero_factor, root_table); + ark_std::println!("a: {:?}", a); + ark_std::println!("b: {:?}", b); + + assert_eq!( + a, b, + "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}", + a, b, input + ); + + input.copy_from_slice(&a); + + // use zeknox::ntt_batch; + // use zeknox::types::NTTConfig; + // if F::CUDA_SUPPORT { + // return ntt_batch( + // 0, + // input.as_mut_ptr(), + // input.len().trailing_zeros() as usize, + // NTTConfig::default(), + // ); + // } else { + // return fft_dispatch_cpu(input, zero_factor, root_table); + // } } /// Batch FFT computation for multiple polynomials on GPU @@ -202,25 +227,46 @@ pub fn coset_fft_batch_with_options( zero_factor: Option, root_table: Option<&FftRootTable>, ) -> Vec> { - #[cfg(feature = "cuda")] - return coset_fft_batch_gpu(polys, zero_factor, root_table); - - #[cfg(not(feature = "cuda"))] - { - // CPU fallback: process each polynomial separately - polys - .into_iter() - .map(|poly| { - let modified_poly: PolynomialCoeffs = F::coset_shift() - .powers() - .zip(&poly.coeffs) - .map(|(r, &c)| r * c) - .collect::>() - .into(); - fft_with_options(modified_poly, zero_factor, root_table) - }) - .collect() - } + // #[cfg(feature = "cuda")] + // { + // let a = coset_fft_batch_gpu(polys.clone(), zero_factor, root_table); + // let b = polys + // .into_iter() + // .map(|poly| { + // let modified_poly: PolynomialCoeffs = F::coset_shift() + // .powers() + // .zip(&poly.coeffs) + // .map(|(r, &c)| r * c) + // .collect::>() + // .into(); + // fft_with_options(modified_poly, zero_factor, root_table) + // }) + // .collect::>(); + // assert_eq!(a.len(), b.len()); + + // for (i, (val_a, val_b)) in a.iter().zip(b.iter()).enumerate() { + // assert_eq!(val_a, val_b, "Mismatch at index {}", i); + // } + + // return a; + // } + + // #[cfg(not(feature = "cuda"))] + // { + // CPU fallback: process each polynomial separately + polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + fft_with_options(modified_poly, zero_factor, root_table) + }) + .collect() + // } } fn fft_dispatch_cpu( @@ -483,11 +529,95 @@ mod tests { #[cfg(feature = "cuda")] use zeknox::init_twiddle_factors_rs; - use crate::fft::{coset_fft_batch, fft, fft_batch, fft_with_options, ifft}; + use crate::fft::{ + coset_fft_batch, fft, fft_batch, fft_dispatch_cpu, fft_dispatch_gpu, fft_with_options, ifft, + }; use crate::goldilocks_field::GoldilocksField; use crate::polynomial::{PolynomialCoeffs, PolynomialValues}; use crate::types::Field; + #[test] + fn test_kat() { + init_twiddle_factors_rs(0, 4); + + let input = [ + 16807u64, + 10376289027450995739, + 18446743787439915009, + 1905022641934172156, + 4730749933575995392, + 68841472, + 18428264577490855681, + 18445589101169082369, + 18446744069414567514, + 8070455041963588582, + 49, + 1625527855624486912, + 7, + 18446744069414555649, + 7696581392640, + 481036337152, + ]; + let input_field: Vec = input + .iter() + .map(|&x| GoldilocksField::from_canonical_u64(x)) + .collect(); + + let res_cpu = [ + 8241673866677297204, + 18443207692673526440, + 3336172192632445894, + 12915814655533318448, + 5977358399840934215, + 2796120128477098295, + 16099264885043452953, + 1114428869533774434, + 1182881845840683068, + 18442399148451944616, + 5639697009785877037, + 5534977815694745617, + 3521085621945067109, + 15650623939293352472, + 11342098386477995483, + 17336148097415430195, + ]; + let res_cpu_field: Vec = res_cpu + .iter() + .map(|&x| GoldilocksField::from_canonical_u64(x)) + .collect(); + + let res_gpu = [ + 8241673866677297204, + 18443207692673526440, + 3336172192632445894, + 12915814655533318448, + 5977358399840934215, + 2796120128477098295, + 16099264885043452953, + 1114428869533774434, + 1182881845840683068, + 18442399148451944616, + 5639697009785877037, + 5534977815694745617, + 3521085621945067109, + 15650623939293352472, + 11342098386477995483, + 17336148097415430195, + ]; + let res_gpu_field: Vec = res_gpu + .iter() + .map(|&x| GoldilocksField::from_canonical_u64(x)) + .collect(); + + let mut input_cpu = input_field.clone(); + fft_dispatch_cpu(&mut input_cpu, None, None); + assert_eq!(input_cpu, res_cpu_field); + + let mut input_gpu = input_field.clone(); + fft_dispatch_gpu(&mut input_gpu, None, None); + assert_eq!(input_gpu, res_gpu_field); + } + #[test] fn fft_and_ifft() { type F = GoldilocksField; diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index be8bf0ad9..28f6e8ed3 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -283,13 +283,13 @@ impl PolynomialCoeffs { zero_factor: Option, root_table: Option<&FftRootTable>, ) -> PolynomialValues { - #[cfg(feature = "cuda")] - { - if F::CUDA_SUPPORT && shift == F::coset_shift() { - // Use GPU coset FFT directly without CPU-side coefficient modification - return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); - } - } + // #[cfg(feature = "cuda")] + // { + // if F::CUDA_SUPPORT && shift == F::coset_shift() { + // // Use GPU coset FFT directly without CPU-side coefficient modification + // return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); + // } + // } // CPU path: multiply by powers of shift, then do regular FFT let modified_poly: Self = shift From 912f1a2bd7b857dcd313f40c57e55de00540d18b Mon Sep 17 00:00:00 2001 From: zhenfeizhang Date: Sun, 23 Nov 2025 19:22:52 -0500 Subject: [PATCH 06/37] wip --- Cargo.toml | 2 +- plonky2/Cargo.toml | 6 +- plonky2/examples/fibonacci.rs | 76 ++++++++++-- plonky2/src/batch_fri/oracle.rs | 12 ++ plonky2/src/batch_fri/prover.rs | 21 ++++ plonky2/src/fri/oracle.rs | 27 ++++- plonky2/src/gadgets/interpolation.rs | 2 +- plonky2/src/hash/merkle_tree.rs | 168 ++++++++++++++++++++++++++- plonky2/src/iop/generator.rs | 6 +- plonky2/src/plonk/prover.rs | 32 ++++- plonky2/src/plonk/verifier.rs | 5 + plonky2/src/util/mod.rs | 108 ++++++++++++++++- 12 files changed, 443 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index eed6218b8..d8ebd31d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } -zeknox_= { path = "../zeknox/wrappers/rust"} +zeknox = { path = "../zeknox/wrappers/rust" } [profile.release] diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 83ff08519..c5c282928 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -12,11 +12,12 @@ keywords.workspace = true categories.workspace = true [features] -default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] +default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] timing = ["std", "dep:web-time"] +cuda = ["plonky2_field/cuda"] [dependencies] ahash = { workspace = true } @@ -38,6 +39,9 @@ plonky2_field = { version = "1.0.0", path = "../field", default-features = false plonky2_maybe_rayon = { version = "1.0.0", path = "../maybe_rayon", default-features = false } plonky2_util = { version = "1.0.0", path = "../util", default-features = false } +# cuda accelerator wrapper +zeknox = { workspace = true } + [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies] getrandom = { version = "0.2", default-features = false, features = ["js"] } diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index 578dc2424..dfce8de8b 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -9,13 +9,19 @@ use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b." /// When a == 0 and b == 1, this is proving knowledge of the 100th (standard) Fibonacci number. fn main() -> Result<()> { + // Initialize logger to see timing output + env_logger::Builder::from_default_env() + .format_timestamp(None) + .filter_level(log::LevelFilter::Debug) + .init(); const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; let config = CircuitConfig::standard_recursion_config(); + println!("Building circuit..."); let mut builder = CircuitBuilder::::new(config); - + println!("Building arithmetic circuit..."); // The arithmetic circuit. let initial_a = builder.add_virtual_target(); let initial_b = builder.add_virtual_target(); @@ -26,6 +32,28 @@ fn main() -> Result<()> { prev_target = cur_target; cur_target = temp; } + println!("Circuit built."); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + println!("Initializing CUDA twiddle factors..."); + // Initialize twiddle factors for all dimensions that will be used + // This test involves multiple polynomials and recursive verification, + // so we initialize a wider range of dimensions to be safe + // for i in 0..=19 { + // zeknox::init_twiddle_factors_rs(0, i); + // } + + zeknox::init_twiddle_factors_rs(0, 3); + zeknox::init_twiddle_factors_rs(0, 6); + // Initialize coset on GPU + // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) + // TODO: Make this generic for other fields if needed + let coset_gen_u64 = 7u64; + // zeknox::init_coset_rs(0, 19, coset_gen_u64); + zeknox::init_coset_rs(0, 6, coset_gen_u64); + } // Public inputs are the two initial values (provided below) and the result (which is generated). builder.register_public_input(initial_a); @@ -38,12 +66,46 @@ fn main() -> Result<()> { pw.set_target(initial_b, F::ONE)?; let data = builder.build::(); - let proof = data.prove(pw)?; - println!( - "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", - proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] - ); + #[cfg(feature = "timing")] + { + use log::Level; + use plonky2::util::timing::TimingTree; + let mut timing = TimingTree::new("prove", Level::Info); + println!("Starting proof generation..."); + let proof = + plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?; + + println!( + "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", + proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] + ); + + // Print first few elements of wires_cap for comparison + println!("First wires_cap hash: {:?}", proof.proof.wires_cap.0[0]); + println!( + "First plonk_zs hash: {:?}", + proof.proof.plonk_zs_partial_products_cap.0[0] + ); + println!( + "First quotient hash: {:?}", + proof.proof.quotient_polys_cap.0[0] + ); + + timing.print(); + data.verify(proof)?; + } + + #[cfg(not(feature = "timing"))] + { + let proof = data.prove(pw)?; + println!( + "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", + proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] + ); + data.verify(proof)?; + } - data.verify(proof) + println!("finished"); + Ok(()) } diff --git a/plonky2/src/batch_fri/oracle.rs b/plonky2/src/batch_fri/oracle.rs index 58deeaa3c..1f31b8cf1 100644 --- a/plonky2/src/batch_fri/oracle.rs +++ b/plonky2/src/batch_fri/oracle.rs @@ -300,6 +300,18 @@ mod test { reduction_arity_bits, }; + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for all dimensions that will be used + // This test involves multiple polynomials and recursive verification, + // so we initialize a wider range of dimensions to be safe + let current_log_size = k0 + fri_params.config.rate_bits; + for i in 0..=current_log_size + 5 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let n0 = 1 << k0; let n1 = 1 << k1; let n2 = 1 << k2; diff --git a/plonky2/src/batch_fri/prover.rs b/plonky2/src/batch_fri/prover.rs index e71fe25b4..6815bb1af 100644 --- a/plonky2/src/batch_fri/prover.rs +++ b/plonky2/src/batch_fri/prover.rs @@ -263,6 +263,17 @@ mod tests { }; let n = 1 << k; + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for all dimensions that will be used + let current_log_size = k + fri_params.config.rate_bits; + for i in 0..=current_log_size { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let trace = PolynomialValues::new((1..n + 1).map(F::from_canonical_i64).collect_vec()); let polynomial_batch: BatchFriOracle = BatchFriOracle::from_values( @@ -359,6 +370,16 @@ mod tests { reduction_arity_bits, }; + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for all dimensions that will be used + let current_log_size = k0 + fri_params.config.rate_bits; + for i in 0..=current_log_size { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let n0 = 1 << k0; let n1 = 1 << k1; let n2 = 1 << k2; diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index e413071a4..803747e34 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -65,7 +65,8 @@ impl, C: GenericConfig, const D: usize> let coeffs = timed!( timing, "IFFT", - values.into_par_iter().map(|v| v.ifft()).collect::>() + // Use sequential iteration for deterministic results + values.into_iter().map(|v| v.ifft()).collect::>() ); Self::from_coeffs( @@ -95,7 +96,20 @@ impl, C: GenericConfig, const D: usize> ); let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values)); + // Debug: Print first leaf for determinism check + if !leaves.is_empty() && !leaves[0].is_empty() { + println!( + "First leaf before reverse_bits: {:?}", + &leaves[0][..4.min(leaves[0].len())] + ); + } reverse_index_bits_in_place(&mut leaves); + if !leaves.is_empty() && !leaves[0].is_empty() { + println!( + "First leaf after reverse_bits: {:?}", + &leaves[0][..4.min(leaves[0].len())] + ); + } let merkle_tree = timed!( timing, "build Merkle tree", @@ -121,9 +135,16 @@ impl, C: GenericConfig, const D: usize> // If blinding, salt with two random elements to each leaf vector. let salt_size = if blinding { SALT_SIZE } else { 0 }; + println!( + "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}", + polynomials.len(), + degree, + blinding, + salt_size + ); polynomials - .par_iter() + .iter() .map(|p| { assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); p.lde(rate_bits) @@ -132,7 +153,7 @@ impl, C: GenericConfig, const D: usize> }) .chain( (0..salt_size) - .into_par_iter() + .into_iter() .map(|_| F::rand_vec(degree << rate_bits)), ) .collect() diff --git a/plonky2/src/gadgets/interpolation.rs b/plonky2/src/gadgets/interpolation.rs index 39b048af4..9aedf7143 100644 --- a/plonky2/src/gadgets/interpolation.rs +++ b/plonky2/src/gadgets/interpolation.rs @@ -86,7 +86,7 @@ mod tests { let value_targets = values .iter() - .map(|&v| (builder.constant_extension(v))) + .map(|&v| builder.constant_extension(v)) .collect::>(); let zt = builder.constant_extension(z); diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 31bcf5e37..0846b7224 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -5,8 +5,16 @@ use core::slice; use plonky2_maybe_rayon::*; use serde::{Deserialize, Serialize}; +#[cfg(feature = "cuda")] +use zeknox::device::{memory::HostOrDeviceSlice, stream::CudaStream}; +#[cfg(feature = "cuda")] +use zeknox::{ + fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, +}; use crate::hash::hash_types::RichField; +#[cfg(feature = "cuda")] +use crate::hash::hash_types::NUM_HASH_OUT_ELTS; use crate::hash::merkle_proofs::MerkleProof; use crate::plonk::config::{GenericHashOut, Hasher}; use crate::util::log2_strict; @@ -148,6 +156,125 @@ pub(crate) fn fill_digests_buf>( ); } +#[cfg(feature = "cuda")] +fn fill_digests_buf_gpu_ptr>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves_ptr: *const F, + leaves_len: usize, + leaf_len: usize, + cap_height: usize, + gpu_id: u64, +) { + let digests_count: u64 = digests_buf.len().try_into().unwrap(); + let leaves_count: u64 = leaves_len.try_into().unwrap(); + let caps_count: u64 = cap_buf.len().try_into().unwrap(); + let cap_height: u64 = cap_height.try_into().unwrap(); + let leaf_size: u64 = leaf_len.try_into().unwrap(); + + // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors + let digests_size = if digests_buf.len() == 0 { + NUM_HASH_OUT_ELTS + } else { + digests_buf.len() * NUM_HASH_OUT_ELTS + }; + let caps_size = if cap_buf.len() == 0 { + NUM_HASH_OUT_ELTS + } else { + cap_buf.len() * NUM_HASH_OUT_ELTS + }; + + let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap(); + let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap(); + + unsafe { + let num_gpus: usize = std::env::var("NUM_OF_GPUS") + .unwrap_or_else(|_| "1".to_string()) + .parse() + .unwrap_or(1); + + if leaves_count >= (1 << 12) && cap_height > 0 && num_gpus > 1 { + // Multi-GPU path + fill_digests_buf_linear_multigpu_with_gpu_ptr( + gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + leaves_ptr as *mut core::ffi::c_void, + digests_count, + caps_count, + leaves_count, + leaf_size, + cap_height, + 0, // hash_type: 0 for Poseidon + ); + } else { + // Single GPU path + fill_digests_buf_linear_gpu_with_gpu_ptr( + gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + leaves_ptr as *mut core::ffi::c_void, + digests_count, + caps_count, + leaves_count, + leaf_size, + cap_height, + 0, // hash_type: 0 for Poseidon + gpu_id, + ); + } + } + + let stream1 = CudaStream::create().unwrap(); + let stream2 = CudaStream::create().unwrap(); + + gpu_digests_buf + .copy_to_host_ptr_async( + digests_buf.as_mut_ptr() as *mut core::ffi::c_void, + digests_size, + &stream1, + ) + .expect("copy digests"); + gpu_cap_buf + .copy_to_host_ptr_async( + cap_buf.as_mut_ptr() as *mut core::ffi::c_void, + caps_size, + &stream2, + ) + .expect("copy caps"); + stream1.synchronize().expect("cuda sync"); + stream2.synchronize().expect("cuda sync"); + stream1.destroy().expect("cuda stream destroy"); + stream2.destroy().expect("cuda stream destroy"); +} + +#[cfg(feature = "cuda")] +fn fill_digests_buf_gpu>( + digests_buf: &mut [MaybeUninit], + cap_buf: &mut [MaybeUninit], + leaves: &Vec, + leaf_size: usize, + cap_height: usize, +) { + let leaves_count = leaves.len() / leaf_size; + let gpu_id = 0; + + let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap(); + + let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice()); + + fill_digests_buf_gpu_ptr::( + digests_buf, + cap_buf, + gpu_leaves_buf.as_mut_ptr(), + leaves_count, + leaf_size, + cap_height, + gpu_id, + ); +} + pub(crate) fn merkle_tree_prove>( leaf_index: usize, leaves_len: usize, @@ -207,10 +334,47 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); - fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); + + // #[cfg(feature = "cuda")] + // { + // // Check if we should use GPU acceleration + // // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set + // let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") { + // if let Ok(threshold) = threshold_str.parse::() { + // leaves.len() >= threshold + // } else { + // leaves.len() >= 1024 + // } + // } else { + // leaves.len() >= 1024 + // }; + + // if use_gpu { + // // Flatten leaves into 1D vector for GPU + // let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() }; + // let zeros = vec![F::ZERO; leaf_size]; + // let mut leaves_1d: Vec = Vec::with_capacity(leaves.len() * leaf_size); + // for leaf in &leaves { + // if leaf.is_empty() { + // leaves_1d.extend(zeros.clone()); + // } else { + // leaves_1d.extend(leaf.clone()); + // } + // } + + // fill_digests_buf_gpu::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); + // } else { + // fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); + // } + // } + + // #[cfg(not(feature = "cuda"))] + { + fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); + } unsafe { - // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to + // SAFETY: `fill_digests_buf` or `fill_digests_buf_gpu` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. digests.set_len(num_digests); cap.set_len(len_cap); diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs index f81508b7a..8e387c4cd 100644 --- a/plonky2/src/iop/generator.rs +++ b/plonky2/src/iop/generator.rs @@ -36,7 +36,7 @@ pub fn generate_partial_witness< let config = &common_data.config; let generators = &prover_data.generators; let generator_indices_by_watches = &prover_data.generator_indices_by_watches; - + println!("Initializing witness."); let mut witness = PartitionWitness::new( config.num_wires, common_data.degree(), @@ -57,6 +57,8 @@ pub fn generate_partial_witness< let mut buffer = GeneratedValues::empty(); + println!("Starting generator execution."); + // Keep running generators until we fail to make progress. while !pending_generator_indices.is_empty() { let mut next_pending_generator_indices = Vec::new(); @@ -96,6 +98,8 @@ pub fn generate_partial_witness< pending_generator_indices = next_pending_generator_indices; } + println!("Finished generator execution."); + if remaining_generators != 0 { return Err(anyhow!("{} generators weren't run", remaining_generators)); } diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index ac0a683cf..649c811a1 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -150,6 +150,7 @@ where let degree = common_data.degree(); set_lookup_wires(prover_data, common_data, &mut partition_witness)?; + println!("Set lookup wires."); let public_inputs = partition_witness.get_targets(&prover_data.public_inputs); let public_inputs_hash = C::InnerHasher::hash_no_pad(&public_inputs); @@ -159,17 +160,25 @@ where "compute full witness", partition_witness.full_witness() ); - + println!("Computed full witness."); let wires_values: Vec> = timed!( timing, "compute wire polynomials", + // Use sequential iteration for deterministic results witness .wire_values - .par_iter() + .iter() .map(|column| PolynomialValues::new(column.clone())) .collect() ); - + println!("Computed wire polynomials."); + // Debug: Print first few wire values to check determinism + if !wires_values.is_empty() && !wires_values[0].values.is_empty() { + println!( + "First wire poly first 5 values: {:?}", + &wires_values[0].values[..5.min(wires_values[0].values.len())] + ); + } let wires_commitment = timed!( timing, "compute wires commitment", @@ -182,7 +191,7 @@ where prover_data.fft_root_table.as_ref(), ) ); - + println!("Computed wires commitment."); let mut challenger = Challenger::::new(); // Observe the FRI config @@ -230,6 +239,7 @@ where .collect(); let zs_partial_products = [plonk_z_vecs, partial_products_and_zs.concat()].concat(); + println!("Computed Z and partial products."); // All lookup polys: RE and partial SLDCs. let lookup_polys = compute_all_lookup_polys(&witness, &deltas, prover_data, common_data, has_lookup); @@ -240,6 +250,7 @@ where zs_partial_products }; + println!("Computed lookup polynomials."); let partial_products_zs_and_lookup_commitment = timed!( timing, "commit to partial products, Z's and, if any, lookup polynomials", @@ -272,7 +283,12 @@ where &alphas, ) ); + println!("prover alphas: {:?}", alphas); + println!("prover betas: {:?}", betas); + println!("prover gammas: {:?}", gammas); + println!("prover deltas: {:?}", deltas); + println!("Split up quotient polys."); let all_quotient_poly_chunks: Vec> = timed!( timing, "split up quotient polys", @@ -288,6 +304,7 @@ where .collect() ); + println!("Committed to quotient polys."); let quotient_polys_commitment = timed!( timing, "commit to quotient polys", @@ -301,9 +318,11 @@ where ) ); + println!("Committed to quotient polys."); challenger.observe_cap::("ient_polys_commitment.merkle_tree.cap); let zeta = challenger.get_extension_challenge::(); + println!("prover zeta: {:?}", zeta); // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`. @@ -313,6 +332,7 @@ where "Opening point is in the subgroup." ); + println!("Constructing the opening set, including lookups."); let openings = timed!( timing, "construct the opening set, including lookups", @@ -326,6 +346,8 @@ where common_data ) ); + println!("Computed openings."); + challenger.observe_openings(&openings.to_fri_openings()); let instance = common_data.get_fri_instance(zeta); @@ -347,7 +369,7 @@ where timing, ) ); - + println!("Computed opening proofs."); let proof = Proof:: { wires_cap: wires_commitment.merkle_tree.cap, plonk_zs_partial_products_cap: partial_products_zs_and_lookup_commitment.merkle_tree.cap, diff --git a/plonky2/src/plonk/verifier.rs b/plonky2/src/plonk/verifier.rs index fa1bc14b8..d369656c6 100644 --- a/plonky2/src/plonk/verifier.rs +++ b/plonky2/src/plonk/verifier.rs @@ -27,6 +27,11 @@ pub(crate) fn verify, C: GenericConfig, c &verifier_data.circuit_digest, common_data, )?; + println!("verifier alphas: {:?}", challenges.plonk_alphas); + println!("verifier betas: {:?}", challenges.plonk_betas); + println!("verifier gammas: {:?}", challenges.plonk_gammas); + println!("verifier deltas: {:?}", challenges.plonk_deltas); + println!("verifier zeta: {:?}", challenges.plonk_zeta); verify_with_challenges::( proof_with_pis.proof, diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index 8f9960034..6f2ae608e 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -6,6 +6,8 @@ use alloc::vec::Vec; use plonky2_maybe_rayon::*; #[doc(inline)] pub use plonky2_util::*; +#[cfg(feature = "cuda")] +use zeknox::{device::memory::HostOrDeviceSlice, transpose_rev_batch, types::TransposeConfig}; use crate::field::polynomial::PolynomialValues; use crate::field::types::Field; @@ -22,10 +24,114 @@ pub(crate) fn transpose_poly_values(polys: Vec>) - transpose(&poly_values) } +#[cfg(feature = "cuda")] +fn transpose_gpu(matrix: &[Vec]) -> Vec> { + use std::time::Instant; + + if matrix.is_empty() || matrix[0].is_empty() { + return vec![]; + } + + let num_rows = matrix.len(); + let num_cols = matrix[0].len(); + let total_elements = num_rows * num_cols; + + // Flatten the 2D matrix into a 1D vector for GPU + let mut flat_input: Vec = Vec::with_capacity(total_elements); + for row in matrix { + flat_input.extend_from_slice(row); + } + + let gpu_id = 0; + let log_n = (num_cols as f64).log2().ceil() as usize; + + // Allocate GPU memory for input and output + let mut gpu_input: HostOrDeviceSlice<'_, T> = + HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements).unwrap(); + let mut gpu_output: HostOrDeviceSlice<'_, T> = + HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements).unwrap(); + + // Copy input to GPU + gpu_input.copy_from_host(&flat_input).unwrap(); + + // Configure transpose + let mut cfg = TransposeConfig::default(); + cfg.batches = num_rows as u32; + cfg.are_inputs_on_device = true; + cfg.are_outputs_on_device = true; + + let timers = Instant::now(); + // Perform GPU transpose + transpose_rev_batch( + gpu_id, + gpu_output.as_mut_ptr(), + gpu_input.as_mut_ptr(), + log_n, + cfg, + ); + println!( + "CUDA transpose of {}x{} took {:?}", + num_rows, + num_cols, + timers.elapsed() + ); + + let timer = Instant::now(); + // Copy result back to host + let mut flat_output = vec![unsafe { std::mem::zeroed() }; total_elements]; + gpu_output + .copy_to_host(&mut flat_output, total_elements) + .unwrap(); + println!( + "CUDA transpose copy back and reshape of {}x{} took {:?}", + num_rows, + num_cols, + timer.elapsed() + ); + + // Reshape back to 2D (transposed) using chunks_exact for better performance + // The GPU transpose outputs in column-major order, so we can just chunk by num_rows + let result: Vec> = flat_output + .chunks_exact(num_rows) + .map(|chunk| chunk.to_vec()) + .collect(); + + result +} + pub fn transpose(matrix: &[Vec]) -> Vec> { + if matrix.is_empty() { + return vec![]; + } + let len = matrix[0].len(); + + // #[cfg(feature = "cuda")] + // { + // // Use GPU for large matrices + // // Threshold: use GPU if total elements >= 2^16 (65536) or if CUDA_TRANSPOSE_THRESHOLD is set + // let num_rows = matrix.len(); + // let num_cols = len; + // let total_elements = num_rows * num_cols; + + // let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_TRANSPOSE_THRESHOLD") { + // if let Ok(threshold) = threshold_str.parse::() { + // total_elements >= threshold + // } else { + // total_elements >= 65536 + // } + // } else { + // total_elements >= 65536 + // }; + + // if use_gpu && num_cols.is_power_of_two() { + // return transpose_gpu(matrix); + // } + // } + + // CPU fallback + // Use sequential iteration for deterministic results (0..len) - .into_par_iter() .map(|i| matrix.iter().map(|row| row[i]).collect()) .collect() } From a04950fda99e1a8da2c24cf39a336543ba55ed22 Mon Sep 17 00:00:00 2001 From: zhenfeizhang Date: Mon, 24 Nov 2025 08:54:30 -0500 Subject: [PATCH 07/37] wip --- field/Cargo.toml | 4 +- field/perm_comp.md | 34 +++++++++++++ field/src/fft.rs | 85 ++++++++++++++++++--------------- field/src/polynomial/mod.rs | 24 ++++++---- plonky2/Cargo.toml | 3 +- plonky2/examples/fibonacci.rs | 10 ++-- plonky2/src/hash/merkle_tree.rs | 68 +++++++++++++------------- 7 files changed, 138 insertions(+), 90 deletions(-) create mode 100644 field/perm_comp.md diff --git a/field/Cargo.toml b/field/Cargo.toml index 12e38e354..1b6a62d71 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -36,6 +36,6 @@ workspace = true [features] -# default = [] -default = [ "cuda" ] +default = [] +# default = [ "cuda" ] cuda = [] \ No newline at end of file diff --git a/field/perm_comp.md b/field/perm_comp.md new file mode 100644 index 000000000..dec02f91f --- /dev/null +++ b/field/perm_comp.md @@ -0,0 +1,34 @@ +# Performance comparison +- CPU: AMD 7950x3d 16 core +- GPU: 4080 super; single card +- + +| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? | +|-----------|---------|---------|---------|------------| +| **Run generators** | 1.7767 | 1.7899 | 0.99x | ✗ Not accelerated | +| **Compute full witness** | 0.3369 | 0.3362 | 1.00x | ✗ Not accelerated | +| **Compute wire polynomials** | 0.0396 | 0.0392 | 1.01x | ✗ Not accelerated | +| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes | +| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** | +| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** | +| └─ Transpose LDEs | 2.8010 | 2.7881 | 1.00x | ✗ Not accelerated | +| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned | +| **Compute partial products** | 0.1700 | 0.1671 | 1.02x | ✗ Not accelerated | +| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes | +| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** | +| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** | +| └─ Transpose LDEs | 0.3906 | 0.3874 | 1.01x | ✗ Not accelerated | +| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned | +| **Compute quotient polys** | 1.4041 | 1.3128 | 1.07x | ✗ Not accelerated | +| **Split quotient polys** | 0.0098 | 0.0212 | 0.46x | ✗ Not accelerated| +| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes | +| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** | +| └─ Transpose LDEs | 0.2952 | 0.2908 | 1.02x | ✗ Not accelerated | +| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned | +| **Construct opening set** | 0.1609 | 0.1600 | 1.01x | ✗ Not accelerated | +| **Compute opening proofs** | 1.3580 | 1.2919 | 1.05x | ✗ Not accelerated | +| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | 1.02x | ✗ Not accelerated | +| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | 1.02x | ✗ Not accelerated | +| └─ Final FFT 4194304 | 0.3083 | 0.3023 | 1.02x | ✗ Not accelerated | +| └─ Fold codewords | 0.1312 | 0.0904 | **1.45x** | ✗ Not accelerated | +| └─ Find PoW witness | 0.0014 | 0.0038 | 0.37x | ✗ Not accelerated | \ No newline at end of file diff --git a/field/src/fft.rs b/field/src/fft.rs index 9c6c8fed6..bccfb3486 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -38,43 +38,46 @@ fn fft_dispatch_gpu( zero_factor: Option, root_table: Option<&FftRootTable>, ) { - use zeknox::ntt_batch; - use zeknox::types::NTTConfig; - - let mut a = input.to_vec(); - let mut b = input.to_vec(); - - ntt_batch( - 0, - a.as_mut_ptr(), - input.len().trailing_zeros() as usize, - NTTConfig::default(), - ); - - fft_dispatch_cpu(&mut b, zero_factor, root_table); - ark_std::println!("a: {:?}", a); - ark_std::println!("b: {:?}", b); - - assert_eq!( - a, b, - "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}", - a, b, input - ); + // if F::CUDA_SUPPORT { + // use zeknox::ntt_batch; + // use zeknox::types::NTTConfig; - input.copy_from_slice(&a); + // let mut a = input.to_vec(); + // let mut b = input.to_vec(); - // use zeknox::ntt_batch; - // use zeknox::types::NTTConfig; - // if F::CUDA_SUPPORT { - // return ntt_batch( + // ntt_batch( // 0, - // input.as_mut_ptr(), + // a.as_mut_ptr(), // input.len().trailing_zeros() as usize, // NTTConfig::default(), // ); - // } else { - // return fft_dispatch_cpu(input, zero_factor, root_table); + + // fft_dispatch_cpu(&mut b, zero_factor, root_table); + // ark_std::println!("a: {:?}", a); + // ark_std::println!("b: {:?}", b); + + // assert_eq!( + // a, b, + // "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}", + // a, b, input + // ); + + // input.copy_from_slice(&a); // } + // return fft_dispatch_cpu(input, zero_factor, root_table); + + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + if F::CUDA_SUPPORT { + return ntt_batch( + 0, + input.as_mut_ptr(), + input.len().trailing_zeros() as usize, + NTTConfig::default(), + ); + } else { + return fft_dispatch_cpu(input, zero_factor, root_table); + } } /// Batch FFT computation for multiple polynomials on GPU @@ -269,7 +272,7 @@ pub fn coset_fft_batch_with_options( // } } -fn fft_dispatch_cpu( +pub(crate) fn fft_dispatch_cpu( input: &mut [F], zero_factor: Option, root_table: Option<&FftRootTable>, @@ -298,10 +301,15 @@ fn fft_dispatch( root_table: Option<&FftRootTable>, ) { #[cfg(feature = "cuda")] - return fft_dispatch_gpu(input, zero_factor, root_table); - + { + // ark_std::println!("Using GPU FFT dispatch"); + return fft_dispatch_gpu(input, zero_factor, root_table); + } #[cfg(not(feature = "cuda"))] - return fft_dispatch_cpu(input, zero_factor, root_table); + { + // ark_std::println!("Using CPU FFT dispatch"); + return fft_dispatch_cpu(input, zero_factor, root_table); + } } #[inline] @@ -529,14 +537,15 @@ mod tests { #[cfg(feature = "cuda")] use zeknox::init_twiddle_factors_rs; - use crate::fft::{ - coset_fft_batch, fft, fft_batch, fft_dispatch_cpu, fft_dispatch_gpu, fft_with_options, ifft, - }; + #[cfg(feature = "cuda")] + use crate::fft::{coset_fft_batch, fft_dispatch_cpu, fft_dispatch_gpu}; + use crate::fft::{fft, fft_batch, fft_with_options, ifft}; use crate::goldilocks_field::GoldilocksField; use crate::polynomial::{PolynomialCoeffs, PolynomialValues}; use crate::types::Field; #[test] + #[cfg(feature = "cuda")] fn test_kat() { init_twiddle_factors_rs(0, 4); @@ -669,7 +678,7 @@ mod tests { type F = GoldilocksField; // Test various polynomial sizes - for log_size in [8, 10, 12, 14] { + for log_size in [8, 10, 12, 14,16,18,20] { let size = 1 << log_size; zeknox::clear_cuda_errors_rs(); init_twiddle_factors_rs(0, log_size); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index 28f6e8ed3..a78cc10d1 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -12,7 +12,7 @@ use plonky2_util::log2_strict; use serde::{Deserialize, Serialize}; use crate::extension::{Extendable, FieldExtension}; -use crate::fft::{fft, fft_with_options, ifft, FftRootTable}; +use crate::fft::{fft, fft_dispatch_cpu, fft_with_options, ifft, FftRootTable}; use crate::types::Field; /// A polynomial in point-value form. @@ -283,22 +283,26 @@ impl PolynomialCoeffs { zero_factor: Option, root_table: Option<&FftRootTable>, ) -> PolynomialValues { - // #[cfg(feature = "cuda")] - // { - // if F::CUDA_SUPPORT && shift == F::coset_shift() { - // // Use GPU coset FFT directly without CPU-side coefficient modification - // return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); - // } - // } + #[cfg(feature = "cuda")] + { + if F::CUDA_SUPPORT && shift == F::coset_shift() { + // Use GPU coset FFT directly without CPU-side coefficient modification + // ark_std::println!("Using GPU coset FFT: degree {}", self.len() - 1); + return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); + } + } // CPU path: multiply by powers of shift, then do regular FFT - let modified_poly: Self = shift + let mut modified_poly: Self = shift .powers() .zip(&self.coeffs) .map(|(r, &c)| r * c) .collect::>() .into(); - modified_poly.fft_with_options(zero_factor, root_table) + + fft_dispatch_cpu(&mut modified_poly.coeffs, zero_factor, root_table); + modified_poly.coeffs.into() + // modified_poly.fft_with_options(zero_factor, root_table) } pub fn to_extension(&self) -> PolynomialCoeffs diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index c5c282928..c84876752 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -12,7 +12,8 @@ keywords.workspace = true categories.workspace = true [features] -default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda"] +default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index dfce8de8b..f456ce46c 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -27,7 +27,7 @@ fn main() -> Result<()> { let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..99 { + for _ in 0..9999999 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; @@ -45,14 +45,14 @@ fn main() -> Result<()> { // zeknox::init_twiddle_factors_rs(0, i); // } - zeknox::init_twiddle_factors_rs(0, 3); - zeknox::init_twiddle_factors_rs(0, 6); + zeknox::init_twiddle_factors_rs(0, 19); + zeknox::init_twiddle_factors_rs(0, 22); // Initialize coset on GPU // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) // TODO: Make this generic for other fields if needed let coset_gen_u64 = 7u64; - // zeknox::init_coset_rs(0, 19, coset_gen_u64); - zeknox::init_coset_rs(0, 6, coset_gen_u64); + zeknox::init_coset_rs(0, 22, coset_gen_u64); + // zeknox::init_coset_rs(0, 16, coset_gen_u64); } // Public inputs are the two initial values (provided below) and the result (which is generated). diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 0846b7224..495825147 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -335,40 +335,40 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); - // #[cfg(feature = "cuda")] - // { - // // Check if we should use GPU acceleration - // // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set - // let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") { - // if let Ok(threshold) = threshold_str.parse::() { - // leaves.len() >= threshold - // } else { - // leaves.len() >= 1024 - // } - // } else { - // leaves.len() >= 1024 - // }; - - // if use_gpu { - // // Flatten leaves into 1D vector for GPU - // let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() }; - // let zeros = vec![F::ZERO; leaf_size]; - // let mut leaves_1d: Vec = Vec::with_capacity(leaves.len() * leaf_size); - // for leaf in &leaves { - // if leaf.is_empty() { - // leaves_1d.extend(zeros.clone()); - // } else { - // leaves_1d.extend(leaf.clone()); - // } - // } - - // fill_digests_buf_gpu::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); - // } else { - // fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); - // } - // } - - // #[cfg(not(feature = "cuda"))] + #[cfg(feature = "cuda")] + { + // Check if we should use GPU acceleration + // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set + let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") { + if let Ok(threshold) = threshold_str.parse::() { + leaves.len() >= threshold + } else { + leaves.len() >= 1024 + } + } else { + leaves.len() >= 1024 + }; + + if use_gpu { + // Flatten leaves into 1D vector for GPU + let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() }; + let zeros = vec![F::ZERO; leaf_size]; + let mut leaves_1d: Vec = Vec::with_capacity(leaves.len() * leaf_size); + for leaf in &leaves { + if leaf.is_empty() { + leaves_1d.extend(zeros.clone()); + } else { + leaves_1d.extend(leaf.clone()); + } + } + + fill_digests_buf_gpu::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); + } else { + fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); + } + } + + #[cfg(not(feature = "cuda"))] { fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); } From 5f625e41a6d72df3b03e89a91b0348d11df9d271 Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Mon, 24 Nov 2025 09:19:24 -0500 Subject: [PATCH 08/37] clean up --- BENCHMARK_RESULTS.md | 166 ------------ arch.md | 417 ----------------------------- field/perm_comp.md => perm_comp.md | 0 task.md | 1 - 4 files changed, 584 deletions(-) delete mode 100644 BENCHMARK_RESULTS.md delete mode 100644 arch.md rename field/perm_comp.md => perm_comp.md (100%) delete mode 100644 task.md diff --git a/BENCHMARK_RESULTS.md b/BENCHMARK_RESULTS.md deleted file mode 100644 index 11cd934e4..000000000 --- a/BENCHMARK_RESULTS.md +++ /dev/null @@ -1,166 +0,0 @@ -# Poseidon vs Poseidon2 Performance Benchmark Results - -## Summary - -This document presents benchmark results comparing **PoseidonGoldilocksConfig** (original Poseidon hash) vs **Poseidon2GoldilocksConfig** (hybrid configuration using Poseidon2 for Merkle trees). - -## Configuration Details - -- **PoseidonGoldilocksConfig**: Uses Poseidon hash for both external (Merkle trees) and internal (circuit) hashing -- **Poseidon2GoldilocksConfig**: Uses Poseidon2 for external hashing (Merkle trees), Poseidon for internal hashing (circuits) - -## Benchmark Results - -### Circuit Size: 100 iterations - -| Configuration | Build Time | Prove Time | Verify Time | Total Time | Speedup | -|--------------|------------|------------|-------------|------------|---------| -| Poseidon | 4.57ms | 6.21ms | 1.36ms | 12.14ms | - | -| Poseidon2 | 2.41ms | 6.36ms | 1.88ms | 10.65ms | **1.14x** | - -**Build speedup: 1.90x** (4.57ms → 2.41ms) - -### Circuit Size: 500 iterations - -| Configuration | Build Time | Prove Time | Verify Time | Total Time | Speedup | -|--------------|------------|------------|-------------|------------|---------| -| Poseidon | 3.32ms | 2.82ms | 1.50ms | 7.63ms | - | -| Poseidon2 | 3.28ms | 13.71ms | 2.17ms | 19.16ms | **0.40x** ⚠️ | - -**Note**: Poseidon2 is slower here - likely due to AVX2 warmup or different circuit structure. - -### Circuit Size: 1000 iterations - -| Configuration | Build Time | Prove Time | Verify Time | Total Time | Speedup | -|--------------|------------|------------|-------------|------------|---------| -| Poseidon | 5.05ms | 8.90ms | 1.87ms | 15.82ms | - | -| Poseidon2 | 5.86ms | 6.97ms | 2.77ms | 15.60ms | **1.01x** | - -**Prove speedup: 1.28x** (8.90ms → 6.97ms) - -## Analysis - -### Key Observations - -1. **Build Time**: Poseidon2 shows significant improvement for small circuits (1.90x faster at size 100) but becomes comparable or slightly slower for larger circuits. - -2. **Proof Generation**: - - For small circuits (100): Similar performance (6.21ms vs 6.36ms) - - For medium circuits (500): Poseidon2 is unexpectedly slower (needs investigation) - - For large circuits (1000): Poseidon2 shows **1.28x speedup** (8.90ms → 6.97ms) - -3. **Verification Time**: Poseidon2 is consistently slower in verification (1.36ms → 1.88ms for size 100), likely due to different hash function overhead. - -4. **Overall Performance**: Mixed results, with best performance at small (100) and large (1000) circuit sizes. - -### Performance Breakdown - -#### Circuit Size: 100 -``` -Poseidon: Build 37.6% | Prove 51.2% | Verify 11.2% -Poseidon2: Build 22.6% | Prove 59.7% | Verify 17.7% -``` - -#### Circuit Size: 1000 -``` -Poseidon: Build 31.9% | Prove 56.3% | Verify 11.8% -Poseidon2: Build 37.5% | Prove 44.7% | Verify 17.8% -``` - -## Performance Characteristics - -### Where Poseidon2 Excels - -✅ **Proof generation for larger circuits** (1.28x speedup at 1000 iterations) -- Better performance in Merkle tree construction -- More efficient FRI commitments with AVX2 optimizations -- Improved matrix multiplication in Poseidon2 hash - -✅ **Circuit building for small circuits** (1.90x speedup at 100 iterations) -- Faster initial setup -- Efficient sponge construction - -### Where Poseidon2 Shows No Improvement - -⚠️ **Medium-sized circuits** (500 iterations) -- Unexpected slowdown in proof generation -- Possibly due to CPU cache effects or AVX2 warmup -- Requires further investigation - -❌ **Verification time** -- Consistently 30-40% slower -- Likely due to Poseidon2 hash computation overhead in verification - -## Recommendations - -### When to Use Poseidon2GoldilocksConfig - -1. **Large circuits with many constraints** - Shows clear proof generation speedup -2. **Applications prioritizing proof generation over verification** - If prover performance is critical -3. **Batch proof generation** - Amortizes the warmup cost - -### When to Use PoseidonGoldilocksConfig - -1. **Applications with frequent verification** - Original Poseidon verifies faster -2. **Medium-sized circuits** - More consistent performance -3. **When stability is critical** - Well-tested, mature implementation - -## Technical Details - -### Hash Function Differences - -**Poseidon**: -- 12-element state width -- 8 full rounds + 22 partial rounds -- Standard MDS matrix - -**Poseidon2**: -- 12-element state width -- 8 full rounds + 22 partial rounds -- Optimized M_E (external) matrix using M_4 blocks -- AVX2-accelerated matrix multiplication -- More efficient internal diffusion layer - -### AVX2 Optimizations - -Both implementations use AVX2 SIMD instructions for: -- S-box computation (x^7 in Goldilocks field) -- Matrix-vector multiplication -- Round constant addition - -Poseidon2 additionally optimizes: -- Block-wise M_4 matrix application -- Internal layer diffusion with diagonal matrix - -## Future Work - -1. **Investigate 500-iteration slowdown** - Profile to understand performance regression -2. **Benchmark with different circuit types** - Test with other operations beyond Fibonacci -3. **Measure memory usage** - Compare memory footprint between configurations -4. **Test on different hardware** - Verify AVX2 benefits across CPUs -5. **Implement Poseidon2Gate** - Enable full Poseidon2 support for in-circuit hashing - -## Running the Benchmark - -To reproduce these results: - -```bash -cargo run --release --example bench_poseidon_vs_poseidon2 -``` - -To benchmark with custom circuit sizes, modify the `circuit_sizes` vector in `main()`: - -```rust -let circuit_sizes = vec![100, 500, 1000, 2000, 5000]; -``` - -## System Information - -- **CPU**: x86_64 with AVX2 support -- **Compiler**: rustc with release optimizations -- **Build**: `--release` with target-cpu=native recommended for best performance - ---- - -**Generated**: 2025-11-07 -**Benchmark Tool**: [bench_poseidon_vs_poseidon2.rs](plonky2/examples/bench_poseidon_vs_poseidon2.rs) diff --git a/arch.md b/arch.md deleted file mode 100644 index 257652a62..000000000 --- a/arch.md +++ /dev/null @@ -1,417 +0,0 @@ -# Plonky2 Circuit Architecture - -This document explains how circuits are laid out and structured in Plonky2. - -## Table of Contents -- [Circuit Matrix Structure](#circuit-matrix-structure) -- [Gate Placement](#gate-placement) -- [Wire Organization](#wire-organization) -- [Data Structure Hierarchy](#data-structure-hierarchy) -- [Constraint System](#constraint-system) -- [Copy Constraints & Permutation](#copy-constraints--permutation) -- [Witness Generation Pipeline](#witness-generation-pipeline) -- [Polynomial Commitments](#polynomial-commitments) -- [Key Design Principles](#key-design-principles) - -## Circuit Matrix Structure - -The circuit is fundamentally a **2D matrix**: -- **Rows**: Gates (operations), numbered 0 to `degree` -- **Columns**: 135 wires total - - 80 routed wires (participate in copy constraints/permutation argument) - - 55 advice wires (local to gates, used for intermediate values) - -**Reference**: [plonky2/src/plonk/circuit_builder.rs:141-207](plonky2/src/plonk/circuit_builder.rs#L141-L207) - -### Wire Layout -``` -Wire Index │ Type │ Purpose -───────────┼───────────────┼────────────────────────────── -0-79 │ Routed │ Can be connected across gates via permutation -80-134 │ Advice │ Local helper wires, not part of permutation -``` - -**Reference**: [plonky2/src/plonk/circuit_data.rs:56-88](plonky2/src/plonk/circuit_data.rs#L56-L88) - -```rust -pub const NUM_ROUTED_WIRES: usize = 80; -pub const NUM_ADVICE_WIRES: usize = 55; -pub const NUM_WIRES: usize = NUM_ROUTED_WIRES + NUM_ADVICE_WIRES; -``` - -## Gate Placement - -Gates are placed **sequentially** in the circuit matrix using a greedy algorithm: - -1. Each gate type defines how many constraint "slots" it needs -2. The builder searches for the next available slot using `find_slot()` -3. Gates are packed efficiently to minimize circuit size -4. Gates with the same degree are grouped together - -**Reference**: [plonky2/src/plonk/circuit_builder.rs:815-845](plonky2/src/plonk/circuit_builder.rs#L815-L845) - -### Selector Polynomials - -Instead of having one selector per gate type, Plonky2 uses **selector polynomials** that partition gates by degree: -- Gates of degree D are grouped together -- Selector polynomial is 1 for gates of that degree, 0 elsewhere -- This enables efficient constraint evaluation without per-gate filtering - -**Reference**: [plonky2/src/plonk/get_vecs.rs:12-68](plonky2/src/plonk/get_vecs.rs#L12-L68) - -## Wire Organization - -### Routed Wires (0-79) -- Participate in the **permutation argument** -- Can be connected across different gates -- Used for inputs/outputs that need to be constrained equal -- Example: connecting output of gate A to input of gate B - -### Advice Wires (80-134) -- Local to individual gates -- Do NOT participate in permutation -- Used for intermediate computations -- Reduces pressure on routed wires -- Example: temporary values in arithmetic operations - -**Reference**: [plonky2/src/plonk/circuit_data.rs:56-88](plonky2/src/plonk/circuit_data.rs#L56-L88) - -## Data Structure Hierarchy - -The circuit data is split into three components for efficiency: - -``` -CircuitData -├── ProverOnlyCircuitData -│ ├── Generators (compute witness values) -│ ├── Sigma polynomials (permutation mappings) -│ ├── Forest (union-find for copy constraints) -│ ├── Representative map -│ └── FFT precomputation tables -├── VerifierOnlyCircuitData -│ ├── Constants Merkle cap -│ └── Circuit digest -└── CommonCircuitData - ├── CircuitConfig - ├── Gates (list of all gate instances) - ├── Selectors (degree-based partitioning) - ├── Quotient degree factor - ├── Public input indices - ├── FRI parameters - └── Circuit digest -``` - -### CircuitData -Main container holding all circuit information. - -**Reference**: [plonky2/src/plonk/circuit_data.rs:185-191](plonky2/src/plonk/circuit_data.rs#L185-L191) - -### ProverOnlyCircuitData -Information needed only by the prover: -- **Generators**: Compute witness values from partial witness -- **Sigma polynomials**: Encode the permutation mapping for copy constraints -- **Forest**: Union-find data structure tracking which wires are constrained equal -- **FFT tables**: Precomputed for polynomial operations - -**Reference**: [plonky2/src/plonk/circuit_data.rs:428-440](plonky2/src/plonk/circuit_data.rs#L428-L440) - -### VerifierOnlyCircuitData -Minimal information for verification: -- Constants Merkle cap (commitment to constants) -- Circuit digest (hash of circuit structure) - -**Reference**: [plonky2/src/plonk/circuit_data.rs:402-426](plonky2/src/plonk/circuit_data.rs#L402-L426) - -### CommonCircuitData -Shared between prover and verifier: -- Configuration parameters -- Gate definitions -- Selector polynomials -- Public input locations -- FRI parameters - -**Reference**: [plonky2/src/plonk/circuit_data.rs:442-480](plonky2/src/plonk/circuit_data.rs#L442-L480) - -## Constraint System - -Each gate implements the `Gate` trait which defines: - -```rust -pub trait Gate, const D: usize>: ... { - fn num_wires(&self) -> usize; // How many wires it uses - fn num_constants(&self) -> usize; // How many constants it needs - fn degree(&self) -> usize; // Max degree of constraints - fn num_constraints(&self) -> usize; // Number of polynomial equations - - fn eval_unfiltered(&self, ...); // Evaluate constraints - fn eval_filtered(&self, ...); // Evaluate with selector -} -``` - -**Reference**: [plonky2/src/plonk/gates/gate.rs:53-260](plonky2/src/plonk/gates/gate.rs#L53-L260) - -### Constraint Evaluation - -Constraints are evaluated in **point-major order**: -- Evaluate all constraints at point 1 -- Then all constraints at point 2 -- Then all constraints at point 3 -- ... - -This is more SIMD-friendly than gate-major order. - -**Reference**: [plonky2/src/plonk/get_vecs.rs:70-110](plonky2/src/plonk/get_vecs.rs#L70-L110) - -### Gate Instance - -A gate instance consists of: -- Gate index (which gate definition) -- Row index (which row in the circuit matrix) - -**Reference**: [plonky2/src/plonk/gates/gate.rs:319-322](plonky2/src/plonk/gates/gate.rs#L319-L322) - -## Copy Constraints & Permutation - -Plonky2 uses the **PLONK permutation argument** to enforce that wires constrained to be equal actually have equal values. - -### Forest (Union-Find) - -Tracks which wires are constrained to be equal: -- Each wire starts in its own set -- `copy_constraint(w1, w2)` unions the sets -- Eventually computes a permutation mapping - -**Reference**: [plonky2/src/plonk/permutation_argument.rs:13-156](plonky2/src/plonk/permutation_argument.rs#L13-L156) - -### Sigma Polynomials - -Encode the permutation mapping: -- For each wire `w`, `sigma(w)` tells you the next wire in its equivalence class -- Forms a cycle through all wires that must be equal -- Committed as part of the circuit structure - -**Reference**: [plonky2/src/plonk/permutation_argument.rs:45-91](plonky2/src/plonk/permutation_argument.rs#L45-L91) - -### Permutation Argument - -During proving: -1. Compute partial products based on wires and sigmas -2. Accumulate these into the `Z` polynomial -3. Prove that `Z` forms a valid permutation product - -During verification: -- Check that permutation constraints hold at random point - -**Reference**: [plonky2/src/plonk/prover.rs:250-289](plonky2/src/plonk/prover.rs#L250-L289) - -## Witness Generation Pipeline - -The witness goes through three forms: - -``` -PartialWitness (sparse, user-provided) - ↓ (apply generators) -PartitionWitness (respects copy constraints) - ↓ (flatten to column-major matrix) -MatrixWitness (dense, ready for polynomials) -``` - -### PartialWitness - -- Sparse representation (HashMap) -- User provides initial values (public inputs, private inputs) -- Not all wires need to be set - -**Reference**: [plonky2/src/iop/witness.rs:283-308](plonky2/src/iop/witness.rs#L283-L308) - -### PartitionWitness - -- Organized by copy-constraint partitions -- Each partition has one representative wire -- Setting a wire sets all wires in its partition -- Generators fill in missing values - -**Reference**: [plonky2/src/iop/witness.rs:310-377](plonky2/src/iop/witness.rs#L310-L377) - -### MatrixWitness - -- Dense 2D array -- Column-major layout (wires are contiguous) -- Ready to be converted to polynomials via FFT -- Used for final proof generation - -**Reference**: [plonky2/src/iop/witness.rs:379-402](plonky2/src/iop/witness.rs#L379-L402) - -### Generators - -Generators compute derived witness values: -- Take some inputs and compute outputs -- Run in topological order based on dependencies -- Examples: arithmetic operations, hash outputs, lookup multiplicities - -**Reference**: [plonky2/src/iop/generator.rs:33-142](plonky2/src/iop/generator.rs#L33-L142) - -## Polynomial Commitments - -Plonky2 uses FRI for polynomial commitments. There are four oracles: - -### Oracle 1: CONSTANTS_SIGMAS -- Constants (gate constants, public inputs) -- Sigma polynomials (permutation mappings) -- **Not blinded** (deterministic, part of circuit structure) - -**Reference**: [plonky2/src/plonk/prover.rs:119-148](plonky2/src/plonk/prover.rs#L119-L148) - -### Oracle 2: WIRES -- Wire witness values -- **Blinded** with random salt -- Committed after witness generation - -**Reference**: [plonky2/src/plonk/prover.rs:153-181](plonky2/src/plonk/prover.rs#L153-L181) - -### Oracle 3: ZS_PARTIAL_PRODUCTS -- Permutation product polynomial (Z) -- Partial products for permutation argument -- **Blinded** - -**Reference**: [plonky2/src/plonk/prover.rs:250-289](plonky2/src/plonk/prover.rs#L250-L289) - -### Oracle 4: QUOTIENT -- Quotient polynomial from constraint division -- Proves all constraints are satisfied -- **Blinded** - -**Reference**: [plonky2/src/plonk/prover.rs:291-339](plonky2/src/plonk/prover.rs#L291-L339) - -### Polynomial Batch Process - -For each oracle: -1. Coefficients in evaluation form -2. **FFT** to coefficient form -3. **Low-degree extension** (LDE) by interpolation -4. Add **blinding salt** (random polynomial) -5. Evaluate LDE on larger domain -6. Build **Merkle tree** over evaluations -7. Return Merkle cap as commitment - -**Reference**: [plonky2/src/plonk/prover.rs:73-111](plonky2/src/plonk/prover.rs#L73-L111) - -### Opening Points - -Polynomials are opened at two points: -- `zeta`: Random challenge point -- `g * zeta`: Next point in coset (for permutation argument) - -**Reference**: [plonky2/src/plonk/verifier.rs:42-167](plonky2/src/plonk/verifier.rs#L42-L167) - -## Key Design Principles - -### 1. Efficiency Through Selectors -Selector polynomials group gates by degree, enabling constraint evaluation without filtering by individual gate type. This is more efficient than standard PLONK. - -### 2. Routed vs Advice Wires -Separating routed wires (participate in permutation) from advice wires (local to gates) reduces the cost of the permutation argument while maintaining flexibility. - -### 3. Modularity -Gates are self-contained with their own constraint logic. New gates can be added without modifying the core proving system. - -### 4. SIMD-Friendly Layout -- Point-major constraint evaluation -- Column-major witness layout -- Both enable efficient vectorization - -### 5. Prover/Verifier Separation -Splitting data into ProverOnly, VerifierOnly, and Common minimizes what the verifier needs, reducing verification cost. - -### 6. Generator Pipeline -The generator system allows complex witness computation while maintaining a clean separation between circuit definition and witness generation. - -### 7. Lookup Arguments -Lookup tables enable efficient range checks, XOR operations, and other lookups without expensive bitwise constraints. - -**Reference**: [plonky2/src/plonk/circuit_builder.rs:1357-1472](plonky2/src/plonk/circuit_builder.rs#L1357-L1472) - -## Advanced Features - -### Recursion -Plonky2 can verify its own proofs: -- Verifier circuit is built using the circuit builder -- Enables proof composition and aggregation -- Special gates for efficient field arithmetic - -**Reference**: [plonky2/src/recursion/](plonky2/src/recursion/) - -### Custom Gates -Users can define custom gates for specific operations: -- Implement the `Gate` trait -- Define constraints and evaluation logic -- Register with the circuit builder - -**Reference**: [plonky2/src/plonk/gates/gate.rs:53-260](plonky2/src/plonk/gates/gate.rs#L53-L260) - -### Lookup Tables -Efficient lookups for operations like: -- Range checks -- Bitwise operations (XOR, AND) -- Small field operations -- S-boxes (for hash functions) - -**Reference**: [plonky2/src/gates/lookup.rs](plonky2/src/gates/lookup.rs), [plonky2/src/gates/lookup_table.rs](plonky2/src/gates/lookup_table.rs) - -## File Reference Index - -### Core Circuit Structure -- [plonky2/src/plonk/circuit_data.rs](plonky2/src/plonk/circuit_data.rs) - Main data structures -- [plonky2/src/plonk/circuit_builder.rs](plonky2/src/plonk/circuit_builder.rs) - Circuit construction - -### Gates -- [plonky2/src/plonk/gates/gate.rs](plonky2/src/plonk/gates/gate.rs) - Gate trait and instances -- [plonky2/src/gates/](plonky2/src/gates/) - Concrete gate implementations - -### Witness -- [plonky2/src/iop/witness.rs](plonky2/src/iop/witness.rs) - Witness types -- [plonky2/src/iop/generator.rs](plonky2/src/iop/generator.rs) - Generator system - -### Proving & Verification -- [plonky2/src/plonk/prover.rs](plonky2/src/plonk/prover.rs) - Proof generation -- [plonky2/src/plonk/verifier.rs](plonky2/src/plonk/verifier.rs) - Proof verification - -### Permutation Argument -- [plonky2/src/plonk/permutation_argument.rs](plonky2/src/plonk/permutation_argument.rs) - Copy constraints - -### Polynomials -- [plonky2/src/plonk/get_vecs.rs](plonky2/src/plonk/get_vecs.rs) - Polynomial evaluation -- [plonky2/src/fri/](plonky2/src/fri/) - FRI commitment scheme - -### Recursion -- [plonky2/src/recursion/](plonky2/src/recursion/) - Recursive proof verification - -## Example: Simple Circuit - -Here's how a simple circuit `c = a + b * 3` would be laid out: - -``` -Row 0: PublicInput gate (for input a) -Row 1: PublicInput gate (for input b) -Row 2: ArithmeticGate (b * 3) -Row 3: ArithmeticGate (a + result_from_row2) -Row 4-N: Padding to reach power-of-2 degree - -Copy constraints: -- a (row 0, wire 0) = a (row 3, wire 0) -- b (row 1, wire 0) = b (row 2, wire 0) -- result (row 2, wire 2) = operand (row 3, wire 1) -``` - -The witness generation would: -1. User provides `a` and `b` in PartialWitness -2. Generators compute intermediate values -3. PartitionWitness ensures copy constraints are satisfied -4. MatrixWitness provides final polynomial values - -See [plonky2/examples/](plonky2/examples/) for complete working examples. - ---- - -**Last Updated**: 2025-11-04 -**Plonky2 Version**: Based on plonky2-lighter repository diff --git a/field/perm_comp.md b/perm_comp.md similarity index 100% rename from field/perm_comp.md rename to perm_comp.md diff --git a/task.md b/task.md deleted file mode 100644 index 11fb7869c..000000000 --- a/task.md +++ /dev/null @@ -1 +0,0 @@ -i need to do bench \ No newline at end of file From d48ec0de47bf9fb473e793b62ce8441e85d2ceb1 Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Mon, 24 Nov 2025 09:21:13 -0500 Subject: [PATCH 09/37] Update perm_comp.md --- perm_comp.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/perm_comp.md b/perm_comp.md index dec02f91f..ec1886a01 100644 --- a/perm_comp.md +++ b/perm_comp.md @@ -5,30 +5,30 @@ | Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? | |-----------|---------|---------|---------|------------| -| **Run generators** | 1.7767 | 1.7899 | 0.99x | ✗ Not accelerated | -| **Compute full witness** | 0.3369 | 0.3362 | 1.00x | ✗ Not accelerated | -| **Compute wire polynomials** | 0.0396 | 0.0392 | 1.01x | ✗ Not accelerated | +| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated | +| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated | +| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated | | **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes | | └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** | | └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** | -| └─ Transpose LDEs | 2.8010 | 2.7881 | 1.00x | ✗ Not accelerated | +| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated | | └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned | -| **Compute partial products** | 0.1700 | 0.1671 | 1.02x | ✗ Not accelerated | +| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated | | **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes | | └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** | | └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** | -| └─ Transpose LDEs | 0.3906 | 0.3874 | 1.01x | ✗ Not accelerated | +| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated | | └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned | -| **Compute quotient polys** | 1.4041 | 1.3128 | 1.07x | ✗ Not accelerated | -| **Split quotient polys** | 0.0098 | 0.0212 | 0.46x | ✗ Not accelerated| +| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated | +| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated| | **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes | | └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** | -| └─ Transpose LDEs | 0.2952 | 0.2908 | 1.02x | ✗ Not accelerated | +| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated | | └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned | -| **Construct opening set** | 0.1609 | 0.1600 | 1.01x | ✗ Not accelerated | -| **Compute opening proofs** | 1.3580 | 1.2919 | 1.05x | ✗ Not accelerated | -| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | 1.02x | ✗ Not accelerated | -| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | 1.02x | ✗ Not accelerated | -| └─ Final FFT 4194304 | 0.3083 | 0.3023 | 1.02x | ✗ Not accelerated | -| └─ Fold codewords | 0.1312 | 0.0904 | **1.45x** | ✗ Not accelerated | -| └─ Find PoW witness | 0.0014 | 0.0038 | 0.37x | ✗ Not accelerated | \ No newline at end of file +| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated | +| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated | +| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated | +| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated | +| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated | +| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated | +| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated | \ No newline at end of file From 6139c76e56d97eeaff478490d3ce370902939aad Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Mon, 24 Nov 2025 09:23:57 -0500 Subject: [PATCH 10/37] Update perm_comp.md --- perm_comp.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/perm_comp.md b/perm_comp.md index ec1886a01..333134394 100644 --- a/perm_comp.md +++ b/perm_comp.md @@ -1,7 +1,9 @@ # Performance comparison -- CPU: AMD 7950x3d 16 core -- GPU: 4080 super; single card -- +- CPU: AMD 7950x3d; 16 core +- GPU: NVidia 4080; single card +- Circuit size: 2^19 gates +- Total CPU time: **32.97 s** +- Total GPU time: **19.71 s** | Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? | |-----------|---------|---------|---------|------------| From 2669e9be44f40cf0af4eeb0c251c94b10b6318c9 Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Tue, 9 Dec 2025 13:33:34 -0500 Subject: [PATCH 11/37] fix --- field/src/fft.rs | 2 +- plonky2/src/hash/merkle_tree.rs | 40 ++++++++++----------------------- 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/field/src/fft.rs b/field/src/fft.rs index bccfb3486..682a1e33d 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -678,7 +678,7 @@ mod tests { type F = GoldilocksField; // Test various polynomial sizes - for log_size in [8, 10, 12, 14,16,18,20] { + for log_size in [8, 10, 12, 14, 16, 18, 20] { let size = 1 << log_size; zeknox::clear_cuda_errors_rs(); init_twiddle_factors_rs(0, log_size); diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 8865cf1c2..109eedeb0 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -15,21 +15,11 @@ use once_cell::sync::Lazy; use plonky2_maybe_rayon::*; use serde::{Deserialize, Serialize}; #[cfg(feature = "cuda")] -<<<<<<< HEAD -use zeknox::device::memory::HostOrDeviceSlice; -#[cfg(feature = "cuda")] -use zeknox::device::stream::CudaStream; -#[cfg(feature = "cuda")] -use zeknox::fill_digests_buf_linear_gpu_with_gpu_ptr; -#[cfg(feature = "cuda")] -use zeknox::fill_digests_buf_linear_multigpu_with_gpu_ptr; -======= use zeknox::device::{memory::HostOrDeviceSlice, stream::CudaStream}; #[cfg(feature = "cuda")] use zeknox::{ fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, }; ->>>>>>> zz-lighter/zz/cuda_integration use crate::hash::hash_types::RichField; #[cfg(feature = "cuda")] @@ -266,12 +256,6 @@ fn fill_digests_buf>( } #[cfg(feature = "cuda")] -<<<<<<< HEAD -#[repr(C)] -union U8U64 { - f1: [u8; 32], - f2: [u64; 4], -======= fn fill_digests_buf_gpu_ptr>( digests_buf: &mut [MaybeUninit], cap_buf: &mut [MaybeUninit], @@ -361,7 +345,6 @@ fn fill_digests_buf_gpu_ptr>( stream2.synchronize().expect("cuda sync"); stream1.destroy().expect("cuda stream destroy"); stream2.destroy().expect("cuda stream destroy"); ->>>>>>> zz-lighter/zz/cuda_integration } #[cfg(feature = "cuda")] @@ -370,8 +353,6 @@ fn fill_digests_buf_gpu>( cap_buf: &mut [MaybeUninit], leaves: &Vec, leaf_size: usize, -<<<<<<< HEAD -======= cap_height: usize, ) { let leaves_count = leaves.len() / leaf_size; @@ -396,7 +377,6 @@ fn fill_digests_buf_gpu>( pub(crate) fn merkle_tree_prove>( leaf_index: usize, leaves_len: usize, ->>>>>>> zz-lighter/zz/cuda_integration cap_height: usize, ) { let leaves_count = leaves.len() / leaf_size; @@ -609,11 +589,6 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); -<<<<<<< HEAD - let now = Instant::now(); - fill_digests_buf_meta::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); - print_time(now, "fill digests buffer"); -======= #[cfg(feature = "cuda")] { @@ -631,7 +606,11 @@ impl> MerkleTree { if use_gpu { // Flatten leaves into 1D vector for GPU - let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() }; + let leaf_size = if leaves.is_empty() { + 0 + } else { + leaves[0].len() + }; let zeros = vec![F::ZERO; leaf_size]; let mut leaves_1d: Vec = Vec::with_capacity(leaves.len() * leaf_size); for leaf in &leaves { @@ -642,7 +621,13 @@ impl> MerkleTree { } } - fill_digests_buf_gpu::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); + fill_digests_buf_gpu::( + digests_buf, + cap_buf, + &leaves_1d, + leaf_size, + cap_height, + ); } else { fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); } @@ -652,7 +637,6 @@ impl> MerkleTree { { fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); } ->>>>>>> zz-lighter/zz/cuda_integration unsafe { // SAFETY: `fill_digests_buf` or `fill_digests_buf_gpu` initialized the spare capacity up to From 6db10e90208f78e4f28c809276edc9b9b1c16fa4 Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Tue, 9 Dec 2025 13:43:53 -0500 Subject: [PATCH 12/37] fix again --- plonky2/src/hash/merkle_tree.rs | 206 ++++---------------------------- 1 file changed, 26 insertions(+), 180 deletions(-) diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 109eedeb0..dfdf94421 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -1,25 +1,27 @@ +#[cfg(feature = "cuda")] +use alloc::sync::Arc; +#[cfg(not(feature = "std"))] +use alloc::vec::Vec; use core::mem::MaybeUninit; use core::slice; use std::collections::HashSet; #[cfg(feature = "cuda")] -use std::sync::Arc; -#[cfg(feature = "cuda")] use std::sync::Mutex; use std::time::Instant; -#[cfg(not(feature = "std"))] -use std::vec::Vec; -use num::range; #[cfg(feature = "cuda")] -use once_cell::sync::Lazy; -use plonky2_maybe_rayon::*; -use serde::{Deserialize, Serialize}; +use cryptography_cuda::device::memory::HostOrDeviceSlice; #[cfg(feature = "cuda")] -use zeknox::device::{memory::HostOrDeviceSlice, stream::CudaStream}; +use cryptography_cuda::device::stream::CudaStream; #[cfg(feature = "cuda")] -use zeknox::{ +use cryptography_cuda::merkle::bindings::{ fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, }; +use num::range; +#[cfg(feature = "cuda")] +use once_cell::sync::Lazy; +use plonky2_maybe_rayon::*; +use serde::{Deserialize, Serialize}; use crate::hash::hash_types::RichField; #[cfg(feature = "cuda")] @@ -33,14 +35,10 @@ use crate::util::log2_strict; #[cfg(feature = "cuda")] pub static GPU_ID: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); -#[cfg(all(feature = "timing", feature = "cuda"))] fn print_time(now: Instant, msg: &str) { println!("Time {} {} ms", msg, now.elapsed().as_millis()); } -#[cfg(not(all(feature = "timing", feature = "cuda")))] -fn print_time(_now: Instant, _msg: &str) {} - #[cfg(feature = "cuda")] const FORCE_SINGLE_GPU: bool = true; @@ -256,95 +254,10 @@ fn fill_digests_buf>( } #[cfg(feature = "cuda")] -fn fill_digests_buf_gpu_ptr>( - digests_buf: &mut [MaybeUninit], - cap_buf: &mut [MaybeUninit], - leaves_ptr: *const F, - leaves_len: usize, - leaf_len: usize, - cap_height: usize, - gpu_id: u64, -) { - let digests_count: u64 = digests_buf.len().try_into().unwrap(); - let leaves_count: u64 = leaves_len.try_into().unwrap(); - let caps_count: u64 = cap_buf.len().try_into().unwrap(); - let cap_height: u64 = cap_height.try_into().unwrap(); - let leaf_size: u64 = leaf_len.try_into().unwrap(); - - // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors - let digests_size = if digests_buf.len() == 0 { - NUM_HASH_OUT_ELTS - } else { - digests_buf.len() * NUM_HASH_OUT_ELTS - }; - let caps_size = if cap_buf.len() == 0 { - NUM_HASH_OUT_ELTS - } else { - cap_buf.len() * NUM_HASH_OUT_ELTS - }; - - let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap(); - let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap(); - - unsafe { - let num_gpus: usize = std::env::var("NUM_OF_GPUS") - .unwrap_or_else(|_| "1".to_string()) - .parse() - .unwrap_or(1); - - if leaves_count >= (1 << 12) && cap_height > 0 && num_gpus > 1 { - // Multi-GPU path - fill_digests_buf_linear_multigpu_with_gpu_ptr( - gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, - gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, - leaves_ptr as *mut core::ffi::c_void, - digests_count, - caps_count, - leaves_count, - leaf_size, - cap_height, - 0, // hash_type: 0 for Poseidon - ); - } else { - // Single GPU path - fill_digests_buf_linear_gpu_with_gpu_ptr( - gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void, - gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void, - leaves_ptr as *mut core::ffi::c_void, - digests_count, - caps_count, - leaves_count, - leaf_size, - cap_height, - 0, // hash_type: 0 for Poseidon - gpu_id, - ); - } - } - - let stream1 = CudaStream::create().unwrap(); - let stream2 = CudaStream::create().unwrap(); - - gpu_digests_buf - .copy_to_host_ptr_async( - digests_buf.as_mut_ptr() as *mut core::ffi::c_void, - digests_size, - &stream1, - ) - .expect("copy digests"); - gpu_cap_buf - .copy_to_host_ptr_async( - cap_buf.as_mut_ptr() as *mut core::ffi::c_void, - caps_size, - &stream2, - ) - .expect("copy caps"); - stream1.synchronize().expect("cuda sync"); - stream2.synchronize().expect("cuda sync"); - stream1.destroy().expect("cuda stream destroy"); - stream2.destroy().expect("cuda stream destroy"); +#[repr(C)] +union U8U64 { + f1: [u8; 32], + f2: [u64; 4], } #[cfg(feature = "cuda")] @@ -356,30 +269,6 @@ fn fill_digests_buf_gpu>( cap_height: usize, ) { let leaves_count = leaves.len() / leaf_size; - let gpu_id = 0; - - let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap(); - - let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice()); - - fill_digests_buf_gpu_ptr::( - digests_buf, - cap_buf, - gpu_leaves_buf.as_mut_ptr(), - leaves_count, - leaf_size, - cap_height, - gpu_id, - ); -} - -pub(crate) fn merkle_tree_prove>( - leaf_index: usize, - leaves_len: usize, - cap_height: usize, -) { - let leaves_count = leaves.len() / leaf_size; let num_gpus: usize = std::env::var("NUM_OF_GPUS") .expect("NUM_OF_GPUS should be set") @@ -552,7 +441,7 @@ fn fill_digests_buf_meta>( cap_height: usize, ) { // if the input is small or if it Keccak hashing, just do the hashing on CPU - if leaf_size <= H::HASH_SIZE / 8 { + if leaf_size <= H::HASH_SIZE / 8 || H::HASHER_TYPE == HasherType::Keccak { fill_digests_buf::(digests_buf, cap_buf, leaves, leaf_size, cap_height); } else { fill_digests_buf_gpu::(digests_buf, cap_buf, leaves, leaf_size, cap_height); @@ -589,57 +478,12 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); - - #[cfg(feature = "cuda")] - { - // Check if we should use GPU acceleration - // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set - let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") { - if let Ok(threshold) = threshold_str.parse::() { - leaves.len() >= threshold - } else { - leaves.len() >= 1024 - } - } else { - leaves.len() >= 1024 - }; - - if use_gpu { - // Flatten leaves into 1D vector for GPU - let leaf_size = if leaves.is_empty() { - 0 - } else { - leaves[0].len() - }; - let zeros = vec![F::ZERO; leaf_size]; - let mut leaves_1d: Vec = Vec::with_capacity(leaves.len() * leaf_size); - for leaf in &leaves { - if leaf.is_empty() { - leaves_1d.extend(zeros.clone()); - } else { - leaves_1d.extend(leaf.clone()); - } - } - - fill_digests_buf_gpu::( - digests_buf, - cap_buf, - &leaves_1d, - leaf_size, - cap_height, - ); - } else { - fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); - } - } - - #[cfg(not(feature = "cuda"))] - { - fill_digests_buf::(digests_buf, cap_buf, &leaves[..], cap_height); - } + let now = Instant::now(); + fill_digests_buf_meta::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); + print_time(now, "fill digests buffer"); unsafe { - // SAFETY: `fill_digests_buf` or `fill_digests_buf_gpu` initialized the spare capacity up to + // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. digests.set_len(num_digests); cap.set_len(len_cap); @@ -998,7 +842,9 @@ mod tests { use super::*; use crate::field::extension::Extendable; use crate::hash::merkle_proofs::verify_merkle_proof_to_cap; - use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig}; + use crate::plonk::config::{ + GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig, + }; fn random_data(n: usize, k: usize) -> Vec> { (0..n).map(|_| F::rand_vec(k)).collect() @@ -1343,7 +1189,7 @@ mod tests { Ok(()) } - + #[test] fn test_merkle_trees_keccak() -> Result<()> { const D: usize = 2; @@ -1358,4 +1204,4 @@ mod tests { Ok(()) } -} +} \ No newline at end of file From 022397af1cc0c39f97cab780b97cf0cde5515858 Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Tue, 9 Dec 2025 14:05:05 -0500 Subject: [PATCH 13/37] fixes --- plonky2/src/hash/merkle_tree.rs | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index dfdf94421..461c04754 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -1,27 +1,27 @@ -#[cfg(feature = "cuda")] -use alloc::sync::Arc; -#[cfg(not(feature = "std"))] -use alloc::vec::Vec; use core::mem::MaybeUninit; use core::slice; use std::collections::HashSet; #[cfg(feature = "cuda")] +use std::sync::Arc; +#[cfg(feature = "cuda")] use std::sync::Mutex; use std::time::Instant; +#[cfg(not(feature = "std"))] +use std::vec::Vec; -#[cfg(feature = "cuda")] -use cryptography_cuda::device::memory::HostOrDeviceSlice; -#[cfg(feature = "cuda")] -use cryptography_cuda::device::stream::CudaStream; -#[cfg(feature = "cuda")] -use cryptography_cuda::merkle::bindings::{ - fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, -}; use num::range; #[cfg(feature = "cuda")] use once_cell::sync::Lazy; use plonky2_maybe_rayon::*; use serde::{Deserialize, Serialize}; +#[cfg(feature = "cuda")] +use zeknox::device::memory::HostOrDeviceSlice; +#[cfg(feature = "cuda")] +use zeknox::device::stream::CudaStream; +#[cfg(feature = "cuda")] +use zeknox::merkle::bindings::{ + fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, +}; use crate::hash::hash_types::RichField; #[cfg(feature = "cuda")] @@ -842,9 +842,7 @@ mod tests { use super::*; use crate::field::extension::Extendable; use crate::hash::merkle_proofs::verify_merkle_proof_to_cap; - use crate::plonk::config::{ - GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig, - }; + use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig}; fn random_data(n: usize, k: usize) -> Vec> { (0..n).map(|_| F::rand_vec(k)).collect() @@ -1189,7 +1187,7 @@ mod tests { Ok(()) } - + #[test] fn test_merkle_trees_keccak() -> Result<()> { const D: usize = 2; @@ -1204,4 +1202,4 @@ mod tests { Ok(()) } -} \ No newline at end of file +} From 914139fabdcb186bb612e3e9c0eeb7301f9a485c Mon Sep 17 00:00:00 2001 From: lighter-zz Date: Tue, 9 Dec 2025 14:06:18 -0500 Subject: [PATCH 14/37] fix --- plonky2/src/hash/merkle_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 461c04754..5bfbb663a 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -19,7 +19,7 @@ use zeknox::device::memory::HostOrDeviceSlice; #[cfg(feature = "cuda")] use zeknox::device::stream::CudaStream; #[cfg(feature = "cuda")] -use zeknox::merkle::bindings::{ +use zeknox::{ fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr, }; From 80d429282b21b0cd9a98a5ea0a75fab56a353fc2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 9 Dec 2025 21:12:27 +0000 Subject: [PATCH 15/37] merkle tree good version --- field/src/fft.rs | 676 +------------------------------- field/src/polynomial/mod.rs | 73 +--- plonky2/examples/fibonacci.rs | 40 +- plonky2/src/hash/merkle_tree.rs | 10 +- 4 files changed, 36 insertions(+), 763 deletions(-) diff --git a/field/src/fft.rs b/field/src/fft.rs index 682a1e33d..d078ca6c3 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -32,284 +32,16 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } -#[cfg(feature = "cuda")] -fn fft_dispatch_gpu( - input: &mut [F], - zero_factor: Option, - root_table: Option<&FftRootTable>, -) { - // if F::CUDA_SUPPORT { - // use zeknox::ntt_batch; - // use zeknox::types::NTTConfig; - - // let mut a = input.to_vec(); - // let mut b = input.to_vec(); - - // ntt_batch( - // 0, - // a.as_mut_ptr(), - // input.len().trailing_zeros() as usize, - // NTTConfig::default(), - // ); - - // fft_dispatch_cpu(&mut b, zero_factor, root_table); - // ark_std::println!("a: {:?}", a); - // ark_std::println!("b: {:?}", b); - - // assert_eq!( - // a, b, - // "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}", - // a, b, input - // ); - - // input.copy_from_slice(&a); - // } - // return fft_dispatch_cpu(input, zero_factor, root_table); - - use zeknox::ntt_batch; - use zeknox::types::NTTConfig; - if F::CUDA_SUPPORT { - return ntt_batch( - 0, - input.as_mut_ptr(), - input.len().trailing_zeros() as usize, - NTTConfig::default(), - ); - } else { - return fft_dispatch_cpu(input, zero_factor, root_table); - } -} - -/// Batch FFT computation for multiple polynomials on GPU -#[cfg(feature = "cuda")] -fn fft_batch_dispatch_gpu( - inputs: &mut [F], - poly_size: usize, - num_polys: usize, - zero_factor: Option, - root_table: Option<&FftRootTable>, -) { - use zeknox::ntt_batch; - use zeknox::types::NTTConfig; - - if F::CUDA_SUPPORT { - let mut cfg = NTTConfig::default(); - cfg.batches = num_polys as u32; - - return ntt_batch( - 0, - inputs.as_mut_ptr(), - poly_size.trailing_zeros() as usize, - cfg, - ); - } else { - // Fallback to CPU: process each polynomial separately - for i in 0..num_polys { - let start = i * poly_size; - let end = start + poly_size; - fft_dispatch_cpu(&mut inputs[start..end], zero_factor, root_table); - } - } -} - -#[cfg(feature = "cuda")] -pub(crate) fn coset_fft_gpu( - poly: PolynomialCoeffs, - zero_factor: Option, - root_table: Option<&FftRootTable>, -) -> PolynomialValues { - use zeknox::ntt_batch; - use zeknox::types::NTTConfig; - - if !F::CUDA_SUPPORT { - // Fallback to CPU if CUDA not supported for this field - let modified_poly: PolynomialCoeffs = F::coset_shift() - .powers() - .zip(&poly.coeffs) - .map(|(r, &c)| r * c) - .collect::>() - .into(); - return fft_with_options(modified_poly, zero_factor, root_table); - } - - let PolynomialCoeffs { coeffs: mut buffer } = poly; - let lg_n = buffer.len().trailing_zeros() as usize; - - // // Initialize coset on GPU - // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) - // // TODO: Make this generic for other fields if needed - // let coset_gen_u64 = 7u64; - // init_coset_rs(0, lg_n, coset_gen_u64); - - // Configure NTT for coset - let mut cfg = NTTConfig::default(); - cfg.with_coset = true; - cfg.ntt_type = zeknox::types::NTTType::Coset; - - // Perform coset NTT on GPU - ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg); - - PolynomialValues::new(buffer) -} - -/// Batch coset FFT computation for multiple polynomials on GPU -#[cfg(feature = "cuda")] -fn coset_fft_batch_gpu( - polys: Vec>, - zero_factor: Option, - root_table: Option<&FftRootTable>, -) -> Vec> { - use zeknox::ntt_batch; - use zeknox::types::NTTConfig; - - if polys.is_empty() { - return Vec::new(); - } - - let num_polys = polys.len(); - let poly_size = polys[0].len(); - - // Verify all polynomials have the same size - assert!( - polys.iter().all(|p| p.len() == poly_size), - "All polynomials must have the same size for batch coset FFT" - ); - - if !F::CUDA_SUPPORT { - // Fallback to CPU if CUDA not supported for this field - return polys - .into_iter() - .map(|poly| { - let modified_poly: PolynomialCoeffs = F::coset_shift() - .powers() - .zip(&poly.coeffs) - .map(|(r, &c)| r * c) - .collect::>() - .into(); - fft_with_options(modified_poly, zero_factor, root_table) - }) - .collect(); - } - - // Flatten all polynomials into a single contiguous buffer - let mut buffer: Vec = Vec::with_capacity(num_polys * poly_size); - for poly in polys { - buffer.extend_from_slice(&poly.coeffs); - } - - let lg_n = poly_size.trailing_zeros() as usize; - - // Configure NTT for batch coset - let mut cfg = NTTConfig::default(); - cfg.batches = num_polys as u32; - cfg.with_coset = true; - cfg.ntt_type = zeknox::types::NTTType::Coset; - - // Perform batch coset NTT on GPU - ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg); - - // Split the buffer back into separate polynomials - buffer - .chunks(poly_size) - .map(|chunk| PolynomialValues::new(chunk.to_vec())) - .collect() -} - -/// Compute coset FFT for multiple polynomials in batch. -/// All polynomials must have the same size (power of 2). -/// Returns a vector of PolynomialValues in the same order as input. -pub fn coset_fft_batch(polys: Vec>) -> Vec> { - coset_fft_batch_with_options(polys, None, None) -} - -/// Compute coset FFT for multiple polynomials in batch with options. -/// All polynomials must have the same size (power of 2). -/// Returns a vector of PolynomialValues in the same order as input. -pub fn coset_fft_batch_with_options( - polys: Vec>, - zero_factor: Option, - root_table: Option<&FftRootTable>, -) -> Vec> { - // #[cfg(feature = "cuda")] - // { - // let a = coset_fft_batch_gpu(polys.clone(), zero_factor, root_table); - // let b = polys - // .into_iter() - // .map(|poly| { - // let modified_poly: PolynomialCoeffs = F::coset_shift() - // .powers() - // .zip(&poly.coeffs) - // .map(|(r, &c)| r * c) - // .collect::>() - // .into(); - // fft_with_options(modified_poly, zero_factor, root_table) - // }) - // .collect::>(); - // assert_eq!(a.len(), b.len()); - - // for (i, (val_a, val_b)) in a.iter().zip(b.iter()).enumerate() { - // assert_eq!(val_a, val_b, "Mismatch at index {}", i); - // } - - // return a; - // } - - // #[cfg(not(feature = "cuda"))] - // { - // CPU fallback: process each polynomial separately - polys - .into_iter() - .map(|poly| { - let modified_poly: PolynomialCoeffs = F::coset_shift() - .powers() - .zip(&poly.coeffs) - .map(|(r, &c)| r * c) - .collect::>() - .into(); - fft_with_options(modified_poly, zero_factor, root_table) - }) - .collect() - // } -} - -pub(crate) fn fft_dispatch_cpu( - input: &mut [F], - zero_factor: Option, - root_table: Option<&FftRootTable>, -) { - if root_table.is_some() { - return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()); - } else { - // let pre_computed = F::pre_compute_fft_root_table(input.len()); - // if pre_computed.is_some() { - // return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap()); - // } else { - // let computed = fft_root_table::(input.len()); - - // return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); - // } - let computed = fft_root_table::(input.len()); - - return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); - }; -} - #[inline] fn fft_dispatch( input: &mut [F], zero_factor: Option, root_table: Option<&FftRootTable>, ) { - #[cfg(feature = "cuda")] - { - // ark_std::println!("Using GPU FFT dispatch"); - return fft_dispatch_gpu(input, zero_factor, root_table); - } - #[cfg(not(feature = "cuda"))] - { - // ark_std::println!("Using CPU FFT dispatch"); - return fft_dispatch_cpu(input, zero_factor, root_table); - } + let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len())); + let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap(); + + fft_classic(input, zero_factor.unwrap_or(0), used_root_table); } #[inline] @@ -328,66 +60,6 @@ pub fn fft_with_options( PolynomialValues::new(buffer) } -/// Compute FFT for multiple polynomials in batch. -/// All polynomials must have the same size (power of 2). -/// Returns a vector of PolynomialValues in the same order as input. -#[inline] -pub fn fft_batch(polys: Vec>) -> Vec> { - fft_batch_with_options(polys, None, None) -} - -/// Compute FFT for multiple polynomials in batch with options. -/// All polynomials must have the same size (power of 2). -/// Returns a vector of PolynomialValues in the same order as input. -pub fn fft_batch_with_options( - polys: Vec>, - zero_factor: Option, - root_table: Option<&FftRootTable>, -) -> Vec> { - if polys.is_empty() { - return Vec::new(); - } - - let num_polys = polys.len(); - let poly_size = polys[0].len(); - - // Verify all polynomials have the same size - assert!( - polys.iter().all(|p| p.len() == poly_size), - "All polynomials must have the same size for batch FFT" - ); - assert!( - poly_size.is_power_of_two(), - "Polynomial size must be a power of 2" - ); - - // Flatten all polynomials into a single contiguous buffer - let mut buffer: Vec = Vec::with_capacity(num_polys * poly_size); - for poly in polys { - buffer.extend_from_slice(&poly.coeffs); - } - - // Dispatch to GPU or CPU batch processing - #[cfg(feature = "cuda")] - fft_batch_dispatch_gpu(&mut buffer, poly_size, num_polys, zero_factor, root_table); - - #[cfg(not(feature = "cuda"))] - { - // CPU fallback: process each polynomial separately - for i in 0..num_polys { - let start = i * poly_size; - let end = start + poly_size; - fft_dispatch_cpu(&mut buffer[start..end], zero_factor, root_table); - } - } - - // Split the buffer back into separate polynomials - buffer - .chunks(poly_size) - .map(|chunk| PolynomialValues::new(chunk.to_vec())) - .collect() -} - #[inline] pub fn ifft(poly: PolynomialValues) -> PolynomialCoeffs { ifft_with_options(poly, None, None) @@ -534,112 +206,18 @@ mod tests { use alloc::vec::Vec; use plonky2_util::{log2_ceil, log2_strict}; - #[cfg(feature = "cuda")] - use zeknox::init_twiddle_factors_rs; - #[cfg(feature = "cuda")] - use crate::fft::{coset_fft_batch, fft_dispatch_cpu, fft_dispatch_gpu}; - use crate::fft::{fft, fft_batch, fft_with_options, ifft}; + use crate::fft::{fft, fft_with_options, ifft}; use crate::goldilocks_field::GoldilocksField; use crate::polynomial::{PolynomialCoeffs, PolynomialValues}; use crate::types::Field; - #[test] - #[cfg(feature = "cuda")] - fn test_kat() { - init_twiddle_factors_rs(0, 4); - - let input = [ - 16807u64, - 10376289027450995739, - 18446743787439915009, - 1905022641934172156, - 4730749933575995392, - 68841472, - 18428264577490855681, - 18445589101169082369, - 18446744069414567514, - 8070455041963588582, - 49, - 1625527855624486912, - 7, - 18446744069414555649, - 7696581392640, - 481036337152, - ]; - let input_field: Vec = input - .iter() - .map(|&x| GoldilocksField::from_canonical_u64(x)) - .collect(); - - let res_cpu = [ - 8241673866677297204, - 18443207692673526440, - 3336172192632445894, - 12915814655533318448, - 5977358399840934215, - 2796120128477098295, - 16099264885043452953, - 1114428869533774434, - 1182881845840683068, - 18442399148451944616, - 5639697009785877037, - 5534977815694745617, - 3521085621945067109, - 15650623939293352472, - 11342098386477995483, - 17336148097415430195, - ]; - let res_cpu_field: Vec = res_cpu - .iter() - .map(|&x| GoldilocksField::from_canonical_u64(x)) - .collect(); - - let res_gpu = [ - 8241673866677297204, - 18443207692673526440, - 3336172192632445894, - 12915814655533318448, - 5977358399840934215, - 2796120128477098295, - 16099264885043452953, - 1114428869533774434, - 1182881845840683068, - 18442399148451944616, - 5639697009785877037, - 5534977815694745617, - 3521085621945067109, - 15650623939293352472, - 11342098386477995483, - 17336148097415430195, - ]; - let res_gpu_field: Vec = res_gpu - .iter() - .map(|&x| GoldilocksField::from_canonical_u64(x)) - .collect(); - - let mut input_cpu = input_field.clone(); - fft_dispatch_cpu(&mut input_cpu, None, None); - assert_eq!(input_cpu, res_cpu_field); - - let mut input_gpu = input_field.clone(); - fft_dispatch_gpu(&mut input_gpu, None, None); - assert_eq!(input_gpu, res_gpu_field); - } - #[test] fn fft_and_ifft() { type F = GoldilocksField; let degree = 200usize; let degree_padded = degree.next_power_of_two(); - #[cfg(feature = "cuda")] - let log_degree = { - zeknox::clear_cuda_errors_rs(); - let log_degree = degree_padded.trailing_zeros() as usize; - init_twiddle_factors_rs(0, log_degree); - log_degree - }; // Create a vector of coeffs; the first degree of them are // "random", the last degree_padded-degree of them are zero. let coeffs = (0..degree) @@ -661,8 +239,6 @@ mod tests { } for r in 0..4 { - #[cfg(feature = "cuda")] - init_twiddle_factors_rs(0, log_degree + r); // expand coefficients by factor 2^r by filling with zeros let zero_tail = coefficients.lde(r); assert_eq!( @@ -672,248 +248,6 @@ mod tests { } } - #[test] - #[cfg(feature = "cuda")] - fn test_fft_gpu_vs_cpu_single() { - type F = GoldilocksField; - - // Test various polynomial sizes - for log_size in [8, 10, 12, 14, 16, 18, 20] { - let size = 1 << log_size; - zeknox::clear_cuda_errors_rs(); - init_twiddle_factors_rs(0, log_size); - - // Create a random polynomial - let coeffs: Vec = (0..size) - .map(|i| F::from_canonical_usize(i * 7919 % 1000000)) - .collect(); - - let poly = PolynomialCoeffs { - coeffs: coeffs.clone(), - }; - - // Compute FFT using GPU (via fft function which dispatches to GPU) - let gpu_result = fft(poly.clone()); - - // Compute FFT using CPU (force CPU path) - let mut cpu_buffer = coeffs.clone(); - super::fft_dispatch_cpu(&mut cpu_buffer, None, None); - let cpu_result = PolynomialValues::new(cpu_buffer); - - // Compare results - assert_eq!( - gpu_result.len(), - cpu_result.len(), - "GPU and CPU results have different lengths for size {}", - size - ); - - for i in 0..size { - assert_eq!( - gpu_result.values[i], cpu_result.values[i], - "Mismatch at index {} for polynomial size {}", - i, size - ); - } - } - } - - #[test] - #[cfg(feature = "cuda")] - fn test_fft_batch_gpu_vs_cpu() { - type F = GoldilocksField; - - let poly_size: usize = 1 << 10; // 1024 elements - let num_polys = 8; - let log_size = poly_size.trailing_zeros() as usize; - - zeknox::clear_cuda_errors_rs(); - init_twiddle_factors_rs(0, log_size); - - // Create multiple random polynomials - let polys: Vec> = (0..num_polys) - .map(|batch_idx| { - let coeffs: Vec = (0..poly_size) - .map(|i| F::from_canonical_usize((i * 7919 + batch_idx * 12345) % 1000000)) - .collect(); - PolynomialCoeffs { coeffs } - }) - .collect(); - - // Compute batch FFT using GPU - let gpu_results = fft_batch(polys.clone()); - - // Compute FFT for each polynomial using CPU - let cpu_results: Vec> = polys - .into_iter() - .map(|poly| { - let mut buffer = poly.coeffs.clone(); - super::fft_dispatch_cpu(&mut buffer, None, None); - PolynomialValues::new(buffer) - }) - .collect(); - - // Compare results - assert_eq!(gpu_results.len(), cpu_results.len()); - for (batch_idx, (gpu_result, cpu_result)) in - gpu_results.iter().zip(cpu_results.iter()).enumerate() - { - assert_eq!(gpu_result.len(), cpu_result.len()); - for i in 0..poly_size { - assert_eq!( - gpu_result.values[i], cpu_result.values[i], - "Batch FFT mismatch at batch {} index {}", - batch_idx, i - ); - } - } - } - - #[test] - #[cfg(feature = "cuda")] - fn test_coset_fft_gpu_vs_cpu_single() { - use zeknox::init_coset_rs; - - use crate::types::PrimeField64; - type F = GoldilocksField; - - for log_size in [8, 10, 12] { - let size = 1 << log_size; - zeknox::clear_cuda_errors_rs(); - init_twiddle_factors_rs(0, log_size); - - // Initialize coset for GPU - let coset_gen_u64 = F::coset_shift().to_canonical_u64(); - init_coset_rs(0, log_size, coset_gen_u64); - - // Create a random polynomial - let coeffs: Vec = (0..size) - .map(|i| F::from_canonical_usize(i * 8191 % 1000000)) - .collect(); - - let poly = PolynomialCoeffs { - coeffs: coeffs.clone(), - }; - - // Compute coset FFT using GPU - let gpu_result = super::coset_fft_gpu(poly.clone(), None, None); - - // Compute coset FFT using CPU (apply coset shift then FFT) - let modified_poly: PolynomialCoeffs = F::coset_shift() - .powers() - .zip(&coeffs) - .map(|(r, &c)| r * c) - .collect::>() - .into(); - - let mut cpu_buffer = modified_poly.coeffs; - super::fft_dispatch_cpu(&mut cpu_buffer, None, None); - let cpu_result = PolynomialValues::new(cpu_buffer); - - // Compare results - assert_eq!( - gpu_result.len(), - cpu_result.len(), - "GPU and CPU coset FFT results have different lengths for size {}", - size - ); - - for i in 0..size { - assert_eq!( - gpu_result.values[i], cpu_result.values[i], - "Coset FFT mismatch at index {} for polynomial size {}", - i, size - ); - } - } - } - - #[test] - #[cfg(feature = "cuda")] - fn test_coset_fft_batch_gpu_vs_cpu() { - use zeknox::init_coset_rs; - - use crate::types::PrimeField64; - type F = GoldilocksField; - - let poly_size: usize = 1 << 10; // 1024 elements - let num_polys = 8; - let log_size = poly_size.trailing_zeros() as usize; - - zeknox::clear_cuda_errors_rs(); - init_twiddle_factors_rs(0, log_size); - - // Initialize coset for GPU - let coset_gen_u64 = F::coset_shift().to_canonical_u64(); - init_coset_rs(0, log_size, coset_gen_u64); - - // Create multiple random polynomials - let polys: Vec> = (0..num_polys) - .map(|batch_idx| { - let coeffs: Vec = (0..poly_size) - .map(|i| F::from_canonical_usize((i * 8191 + batch_idx * 54321) % 1000000)) - .collect(); - PolynomialCoeffs { coeffs } - }) - .collect(); - - // Compute batch coset FFT using GPU - let gpu_results = coset_fft_batch(polys.clone()); - - // Compute coset FFT for each polynomial using CPU - let cpu_results: Vec> = polys - .into_iter() - .map(|poly| { - let modified_poly: PolynomialCoeffs = F::coset_shift() - .powers() - .zip(&poly.coeffs) - .map(|(r, &c)| r * c) - .collect::>() - .into(); - - let mut buffer = modified_poly.coeffs; - super::fft_dispatch_cpu(&mut buffer, None, None); - PolynomialValues::new(buffer) - }) - .collect(); - - // Compare results - assert_eq!(gpu_results.len(), cpu_results.len()); - for (batch_idx, (gpu_result, cpu_result)) in - gpu_results.iter().zip(cpu_results.iter()).enumerate() - { - assert_eq!(gpu_result.len(), cpu_result.len()); - for i in 0..poly_size { - assert_eq!( - gpu_result.values[i], cpu_result.values[i], - "Batch coset FFT mismatch at batch {} index {}", - batch_idx, i - ); - } - } - } - - #[test] - fn test_batch_fft_empty() { - type F = GoldilocksField; - let polys: Vec> = vec![]; - let results = fft_batch(polys); - assert!(results.is_empty()); - } - - #[test] - #[should_panic(expected = "All polynomials must have the same size")] - fn test_batch_fft_different_sizes() { - type F = GoldilocksField; - let poly1 = PolynomialCoeffs { - coeffs: vec![F::ONE; 256], - }; - let poly2 = PolynomialCoeffs { - coeffs: vec![F::ONE; 512], - }; - let _ = fft_batch(vec![poly1, poly2]); - } - fn evaluate_naive(coefficients: &PolynomialCoeffs) -> PolynomialValues { let degree = coefficients.len(); let degree_padded = 1 << log2_ceil(degree); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index a78cc10d1..c13bbca27 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -12,7 +12,7 @@ use plonky2_util::log2_strict; use serde::{Deserialize, Serialize}; use crate::extension::{Extendable, FieldExtension}; -use crate::fft::{fft, fft_dispatch_cpu, fft_with_options, ifft, FftRootTable}; +use crate::fft::{fft, fft_with_options, ifft, FftRootTable}; use crate::types::Field; /// A polynomial in point-value form. @@ -283,26 +283,13 @@ impl PolynomialCoeffs { zero_factor: Option, root_table: Option<&FftRootTable>, ) -> PolynomialValues { - #[cfg(feature = "cuda")] - { - if F::CUDA_SUPPORT && shift == F::coset_shift() { - // Use GPU coset FFT directly without CPU-side coefficient modification - // ark_std::println!("Using GPU coset FFT: degree {}", self.len() - 1); - return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); - } - } - - // CPU path: multiply by powers of shift, then do regular FFT - let mut modified_poly: Self = shift + let modified_poly: Self = shift .powers() .zip(&self.coeffs) .map(|(r, &c)| r * c) .collect::>() .into(); - - fft_dispatch_cpu(&mut modified_poly.coeffs, zero_factor, root_table); - modified_poly.coeffs.into() - // modified_poly.fft_with_options(zero_factor, root_table) + modified_poly.fft_with_options(zero_factor, root_table) } pub fn to_extension(&self) -> PolynomialCoeffs @@ -453,8 +440,6 @@ impl Mul for &PolynomialCoeffs { mod tests { use std::time::Instant; - #[cfg(feature = "cuda")] - use plonky2_util::log2_ceil; use rand::rngs::OsRng; use rand::Rng; @@ -494,13 +479,6 @@ mod tests { let k = 8; let n = 1 << k; - - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - zeknox::init_twiddle_factors_rs(0, k); - } - let poly = PolynomialCoeffs::new(F::rand_vec(n)); let shift = F::rand(); let coset_evals = poly.coset_fft(shift).values; @@ -522,13 +500,6 @@ mod tests { let k = 8; let n = 1 << k; - - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - zeknox::init_twiddle_factors_rs(0, k); - } - let evals = PolynomialValues::new(F::rand_vec(n)); let shift = F::rand(); let coeffs = evals.clone().coset_ifft(shift); @@ -549,12 +520,6 @@ mod tests { type F = GoldilocksField; let mut rng = OsRng; let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000)); - - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - zeknox::init_twiddle_factors_rs(0, log2_ceil(a_deg + b_deg + 1)); - } let a = PolynomialCoeffs::new(F::rand_vec(a_deg)); let b = PolynomialCoeffs::new(F::rand_vec(b_deg)); let m1 = &a * &b; @@ -572,24 +537,11 @@ mod tests { let mut rng = OsRng; let a_deg = rng.gen_range(0..1_000); let n = rng.gen_range(1..1_000); - - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - for i in 1..=log2_ceil(max(a_deg, n)) + 1 { - zeknox::init_twiddle_factors_rs(0, i); - } - } - let mut a = PolynomialCoeffs::new(F::rand_vec(a_deg + 1)); - println!("a {} b {}", a.len(), n); - if a.coeffs[0].is_zero() { a.coeffs[0] = F::ONE; // First coefficient needs to be nonzero. } let b = a.inv_mod_xn(n); - println!("a {} b {}", a.len(), b.len()); - let mut m = &a * &b; m.coeffs.truncate(n); m.trim(); @@ -623,15 +575,6 @@ mod tests { type F = GoldilocksField; let mut rng = OsRng; let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000)); - - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - for i in 1..=log2_ceil(max(a_deg, b_deg)) + 1 { - zeknox::init_twiddle_factors_rs(0, i); - } - } - let a = PolynomialCoeffs::new(F::rand_vec(a_deg)); let b = PolynomialCoeffs::new(F::rand_vec(b_deg)); let (q, r) = a.div_rem(&b); @@ -663,7 +606,6 @@ mod tests { let mut rng = OsRng; let l = 14; let n = 1 << l; - let g = F::primitive_root_of_unity(l); let xn_minus_one = { let mut xn_min_one_vec = vec![F::ZERO; n + 1]; @@ -674,15 +616,6 @@ mod tests { let a = g.exp_u64(rng.gen_range(0..(n as u64))); let denom = PolynomialCoeffs::new(vec![-a, F::ONE]); - - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - for i in 1..=l + 1 { - zeknox::init_twiddle_factors_rs(0, i); - } - } - let now = Instant::now(); xn_minus_one.div_rem(&denom); println!("Division time: {:?}", now.elapsed()); diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index 2d2460b92..9573048a6 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -29,33 +29,33 @@ fn main() -> Result<()> { let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..9999999 { + for _ in 0..2999999 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; } println!("Circuit built."); - #[cfg(feature = "cuda")] - { - zeknox::clear_cuda_errors_rs(); - println!("Initializing CUDA twiddle factors..."); - // Initialize twiddle factors for all dimensions that will be used - // This test involves multiple polynomials and recursive verification, - // so we initialize a wider range of dimensions to be safe - // for i in 0..=19 { - // zeknox::init_twiddle_factors_rs(0, i); - // } + // #[cfg(feature = "cuda")] + // { + // zeknox::clear_cuda_errors_rs(); + // println!("Initializing CUDA twiddle factors..."); + // // Initialize twiddle factors for all dimensions that will be used + // // This test involves multiple polynomials and recursive verification, + // // so we initialize a wider range of dimensions to be safe + // // for i in 0..=19 { + // // zeknox::init_twiddle_factors_rs(0, i); + // // } - zeknox::init_twiddle_factors_rs(0, 19); - zeknox::init_twiddle_factors_rs(0, 22); - // Initialize coset on GPU - // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) - // TODO: Make this generic for other fields if needed - let coset_gen_u64 = 7u64; - zeknox::init_coset_rs(0, 22, coset_gen_u64); - // zeknox::init_coset_rs(0, 16, coset_gen_u64); - } + // zeknox::init_twiddle_factors_rs(0, 19); + // zeknox::init_twiddle_factors_rs(0, 22); + // // Initialize coset on GPU + // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) + // // TODO: Make this generic for other fields if needed + // let coset_gen_u64 = 7u64; + // zeknox::init_coset_rs(0, 22, coset_gen_u64); + // // zeknox::init_coset_rs(0, 16, coset_gen_u64); + // } // Public inputs are the two initial values (provided below) and the result (which is generated). builder.register_public_input(initial_a); diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 5bfbb663a..3561471bd 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -281,10 +281,16 @@ fn fill_digests_buf_gpu>( if *gpu_id_lock >= num_gpus as u64 { *gpu_id_lock = 0; } + println!("Using GPU id {} leave length {}", gpu_id, leaves.len()); let now = Instant::now(); - let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap(); + let gpu_leaves_buf_result = HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()); + + if gpu_leaves_buf_result.is_err() { + panic!("CUDA malloc failed, falling back to CPU for Merkle tree generation"); + } + + let mut gpu_leaves_buf = gpu_leaves_buf_result.unwrap(); print_time(now, "alloc gpu leaves buffer"); let now = Instant::now(); From af979c0a9383b07a44c5ee7f71a948b3d24dec50 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 18:43:33 +0000 Subject: [PATCH 16/37] working --- field/src/extension/quadratic.rs | 8 ++ field/src/extension/quartic.rs | 8 ++ field/src/extension/quintic.rs | 8 ++ field/src/fft.rs | 137 ++++++++++++++++++++++++++- field/src/goldilocks_field.rs | 7 ++ field/src/polynomial/mod.rs | 6 +- field/src/secp256k1_base.rs | 7 ++ field/src/secp256k1_scalar.rs | 7 ++ field/src/types.rs | 3 + plonky2/examples/fibonacci.rs | 64 ++++++++----- plonky2/src/fri/oracle.rs | 3 +- plonky2/src/plonk/circuit_builder.rs | 2 + 12 files changed, 230 insertions(+), 30 deletions(-) diff --git a/field/src/extension/quadratic.rs b/field/src/extension/quadratic.rs index 281369d21..ca74747b9 100644 --- a/field/src/extension/quadratic.rs +++ b/field/src/extension/quadratic.rs @@ -59,6 +59,14 @@ impl> Sample for QuadraticExtension { } impl> Field for QuadraticExtension { + fn to_u64(&self) -> u64 { + unimplemented!() + } + + fn from_u64(u: u64) -> Self { + unimplemented!() + } + const ZERO: Self = Self([F::ZERO; 2]); const ONE: Self = Self([F::ONE, F::ZERO]); const TWO: Self = Self([F::TWO, F::ZERO]); diff --git a/field/src/extension/quartic.rs b/field/src/extension/quartic.rs index 8c8a9e7e4..daa9d3aaf 100644 --- a/field/src/extension/quartic.rs +++ b/field/src/extension/quartic.rs @@ -65,6 +65,14 @@ impl> Sample for QuarticExtension { } impl> Field for QuarticExtension { + fn to_u64(&self) -> u64 { + unimplemented!() + } + + fn from_u64(u: u64) -> Self { + unimplemented!() + } + const ZERO: Self = Self([F::ZERO; 4]); const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO]); const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO]); diff --git a/field/src/extension/quintic.rs b/field/src/extension/quintic.rs index 28ec92267..21817c6c8 100644 --- a/field/src/extension/quintic.rs +++ b/field/src/extension/quintic.rs @@ -66,6 +66,14 @@ impl> Sample for QuinticExtension { } impl> Field for QuinticExtension { + fn to_u64(&self) -> u64 { + unimplemented!() + } + + fn from_u64(u: u64) -> Self { + unimplemented!() + } + const ZERO: Self = Self([F::ZERO; 5]); const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO, F::ZERO]); const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO, F::ZERO]); diff --git a/field/src/fft.rs b/field/src/fft.rs index d078ca6c3..666c4d697 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -32,16 +32,111 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } +#[allow(dead_code)] +#[cfg(feature = "cuda")] +fn fft_dispatch_gpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + if F::CUDA_SUPPORT { + use zeknox::device::memory::HostOrDeviceSlice; + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + // let mut input_clone = input.to_vec(); + // fft_dispatch_cpu(&mut input_clone, zero_factor, root_table); + // ark_std::println!("cpu done" ); + + let total_elements = input.len(); + let mut io_u64 = input.iter().map(|x| x.to_u64()).collect::>(); + + let mut device_data: HostOrDeviceSlice<'_, u64> = + HostOrDeviceSlice::cuda_malloc(0, total_elements).unwrap(); + device_data + .copy_from_host_offset(&io_u64, 0, total_elements) + .unwrap(); + ntt_batch( + 0, + device_data.as_mut_ptr() as *mut F, + input.len().trailing_zeros() as usize, + NTTConfig::default(), + ); + + // Copy results back from device to host + io_u64.resize(total_elements, 0u64); + device_data + .copy_to_host(&mut io_u64, total_elements) + .unwrap(); + + // Convert u64 results back to field elements + input.iter_mut().zip(io_u64.iter()).for_each(|(a, b)| { + *a = F::from_canonical_u64(*b); + }); + // ark_std::println!("gpu done" ); + + // let mut to_print = false; + // for (i, (a, b)) in input.iter().zip(input_clone.iter()).enumerate() { + // if a != b { + // // panic!("Mismatch at index {}: gpu result = {}, cpu result = {}", i, a.to_u64(), b.to_u64()); + // to_print = true; + // ark_std::println!( + // "Mismatch at index {}: gpu result = {}, cpu result = {}", + // i, + // a.to_u64(), + // b.to_u64() + // ); + // } + // } + + // if to_print { + // ark_std::println!("Comparing results..."); + // ark_std::println!("cpu {:?}", input_clone); + // ark_std::println!("gpu {:?}", input); + // } + + return; + } else { + return fft_dispatch_cpu(input, zero_factor, root_table); + } +} + +fn fft_dispatch_cpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + if root_table.is_some() { + return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()); + } else { + // let pre_computed = F::pre_compute_fft_root_table(input.len()); + // if pre_computed.is_some() { + // return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap()); + // } else { + // let computed = fft_root_table::(input.len()); + + // return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + // } + let computed = fft_root_table::(input.len()); + + return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + }; +} + #[inline] fn fft_dispatch( input: &mut [F], zero_factor: Option, root_table: Option<&FftRootTable>, ) { - let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len())); - let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap(); - - fft_classic(input, zero_factor.unwrap_or(0), used_root_table); + #[cfg(feature = "cuda")] + { + return fft_dispatch_gpu(input, zero_factor, root_table); + } + #[cfg(not(feature = "cuda"))] + { + return fft_dispatch_cpu(input, zero_factor, root_table); + } } #[inline] @@ -50,6 +145,7 @@ pub fn fft(poly: PolynomialCoeffs) -> PolynomialValues { } #[inline] + pub fn fft_with_options( poly: PolynomialCoeffs, zero_factor: Option, @@ -65,6 +161,28 @@ pub fn ifft(poly: PolynomialValues) -> PolynomialCoeffs { ifft_with_options(poly, None, None) } +#[inline] +pub fn ifft_cpu(poly: PolynomialValues) -> PolynomialCoeffs { + let n = poly.len(); + let lg_n = log2_strict(n); + let n_inv = F::inverse_2exp(lg_n); + + let PolynomialValues { values: mut buffer } = poly; + fft_dispatch_cpu(&mut buffer, None, None); + + // We reverse all values except the first, and divide each by n. + buffer[0] *= n_inv; + buffer[n / 2] *= n_inv; + for i in 1..(n / 2) { + let j = n - i; + let coeffs_i = buffer[j] * n_inv; + let coeffs_j = buffer[i] * n_inv; + buffer[i] = coeffs_i; + buffer[j] = coeffs_j; + } + PolynomialCoeffs { coeffs: buffer } +} + pub fn ifft_with_options( poly: PolynomialValues, zero_factor: Option, @@ -217,12 +335,20 @@ mod tests { type F = GoldilocksField; let degree = 200usize; let degree_padded = degree.next_power_of_two(); + println!("Initializing CUDA"); + + #[cfg(feature = "cuda")] + for i in 8..=12 { + zeknox::init_twiddle_factors_rs(0, i); + } + + println!("Testing fft/ifft with degree {}", degree); // Create a vector of coeffs; the first degree of them are // "random", the last degree_padded-degree of them are zero. let coeffs = (0..degree) .map(|i| F::from_canonical_usize(i * 1337 % 100)) - .chain(core::iter::repeat_n(F::ZERO, degree_padded - degree)) + .chain(core::iter::repeat(F::ZERO).take(degree_padded - degree)) .collect::>(); assert_eq!(coeffs.len(), degree_padded); let coefficients = PolynomialCoeffs { coeffs }; @@ -238,6 +364,7 @@ mod tests { assert_eq!(interpolated_coefficients.coeffs[i], F::ZERO); } + println!("Testing ldes"); for r in 0..4 { // expand coefficients by factor 2^r by filling with zeros let zero_tail = coefficients.lde(r); diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index ae8457744..2ac66d4ea 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -68,6 +68,13 @@ impl Sample for GoldilocksField { } impl Field for GoldilocksField { + fn to_u64(&self) -> u64 { + self.0 + } + fn from_u64(u: u64) -> Self { + Self(u) + } + const ZERO: Self = Self(0); const ONE: Self = Self(1); const TWO: Self = Self(2); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index c13bbca27..21eae220c 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -12,7 +12,7 @@ use plonky2_util::log2_strict; use serde::{Deserialize, Serialize}; use crate::extension::{Extendable, FieldExtension}; -use crate::fft::{fft, fft_with_options, ifft, FftRootTable}; +use crate::fft::{FftRootTable, fft, fft_with_options, ifft, ifft_cpu}; use crate::types::Field; /// A polynomial in point-value form. @@ -59,6 +59,10 @@ impl PolynomialValues { ifft(self) } + pub fn ifft_cpu(self) -> PolynomialCoeffs { + ifft_cpu(self) + } + /// Returns the polynomial whose evaluation on the coset `shift*H` is `self`. pub fn coset_ifft(self, shift: F) -> PolynomialCoeffs { let mut shifted_coeffs = self.ifft(); diff --git a/field/src/secp256k1_base.rs b/field/src/secp256k1_base.rs index 6632a7f83..20bc7a395 100644 --- a/field/src/secp256k1_base.rs +++ b/field/src/secp256k1_base.rs @@ -98,6 +98,13 @@ impl Field for Secp256K1Base { const BITS: usize = 256; + fn to_u64(&self) -> u64 { + unimplemented!() + } + fn from_u64(u: u64) -> Self { + unimplemented!() + } + fn order() -> BigUint { BigUint::from_slice(&[ 0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs index 3ca5b6ba2..3514188e9 100644 --- a/field/src/secp256k1_scalar.rs +++ b/field/src/secp256k1_scalar.rs @@ -79,6 +79,13 @@ impl Sample for Secp256K1Scalar { } impl Field for Secp256K1Scalar { + fn to_u64(&self) -> u64 { + unimplemented!() + } + fn from_u64(u: u64) -> Self { + unimplemented!() + } + const ZERO: Self = Self([0; 4]); const ONE: Self = Self([1, 0, 0, 0]); const TWO: Self = Self([2, 0, 0, 0]); diff --git a/field/src/types.rs b/field/src/types.rs index 5a34bb6a3..f365ea299 100644 --- a/field/src/types.rs +++ b/field/src/types.rs @@ -94,6 +94,9 @@ pub trait Field: /// Whether this field is supported by cuda const CUDA_SUPPORT: bool = false; + fn to_u64(&self) -> u64; + fn from_u64(u: u64) -> Self; + fn order() -> BigUint; fn characteristic() -> BigUint; diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index 9573048a6..ead3fc974 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -29,46 +29,64 @@ fn main() -> Result<()> { let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..2999999 { + for _ in 0..9999999 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; } println!("Circuit built."); - // #[cfg(feature = "cuda")] - // { - // zeknox::clear_cuda_errors_rs(); - // println!("Initializing CUDA twiddle factors..."); - // // Initialize twiddle factors for all dimensions that will be used - // // This test involves multiple polynomials and recursive verification, - // // so we initialize a wider range of dimensions to be safe - // // for i in 0..=19 { - // // zeknox::init_twiddle_factors_rs(0, i); - // // } + let size = 19; - // zeknox::init_twiddle_factors_rs(0, 19); - // zeknox::init_twiddle_factors_rs(0, 22); - // // Initialize coset on GPU - // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) - // // TODO: Make this generic for other fields if needed - // let coset_gen_u64 = 7u64; - // zeknox::init_coset_rs(0, 22, coset_gen_u64); - // // zeknox::init_coset_rs(0, 16, coset_gen_u64); - // } + #[cfg(feature = "cuda")] + { + use plonky2_field::fft; + use plonky2_field::goldilocks_field::GoldilocksField; + use plonky2_field::polynomial::PolynomialCoeffs; + + zeknox::clear_cuda_errors_rs(); + println!("Initializing CUDA twiddle factors..."); + // Initialize twiddle factors for all dimensions that will be used + // This test involves multiple polynomials and recursive verification, + // so we initialize a wider range of dimensions to be safe + // for i in 0..=19 { + // zeknox::init_twiddle_factors_rs(0, i); + // } + + zeknox::init_twiddle_factors_rs(0, size); + zeknox::init_twiddle_factors_rs(0, size+3); + // Initialize coset on GPU + // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) + // TODO: Make this generic for other fields if needed + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, size+3, coset_gen_u64); + + // warm up GPU + // for some reason the first 10 FFTs are somewhat buggy + + for i in 0..10 { + let t = (0..1 << size) + .map(|x| GoldilocksField::from_u64(i * x as u64)) + .collect(); + let poly = PolynomialCoeffs::new(t); + let _ = plonky2_field::fft::fft(poly.clone()); + } + println!("CUDA twiddle factors initialized."); + // zeknox::init_coset_rs(0, 16, coset_gen_u64); + } // Public inputs are the two initial values (provided below) and the result (which is generated). builder.register_public_input(initial_a); builder.register_public_input(initial_b); builder.register_public_input(cur_target); - + println!("Public inputs registered."); // Provide initial values. let mut pw = PartialWitness::new(); pw.set_target(initial_a, F::ZERO)?; pw.set_target(initial_b, F::ONE)?; - + println!("Initial values set in witness."); let data = builder.build::(); - + println!("Circuit data built. Generating proof..."); #[cfg(feature = "timing")] { use log::Level; diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 174068c73..ed5af3b67 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -62,11 +62,12 @@ impl, C: GenericConfig, const D: usize> timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, ) -> Self { + println!("using slow ifft_cpu"); let coeffs = timed!( timing, "IFFT", // Use sequential iteration for deterministic results - values.into_iter().map(|v| v.ifft()).collect::>() + values.into_iter().map(|v| v.ifft_cpu()).collect::>() ); Self::from_coeffs( diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index 421d15cdc..6ca5b712d 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -1225,6 +1225,7 @@ impl, const D: usize> CircuitBuilder { let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor))); let fft_root_table = fft_root_table(max_fft_points); + println!("start to commit"); let constants_sigmas_commitment = if commit_to_sigma { let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat(); PolynomialBatch::::from_values( @@ -1238,6 +1239,7 @@ impl, const D: usize> CircuitBuilder { } else { PolynomialBatch::::default() }; + println!("end commit"); // Map between gates where not all generators are used and the gate's number of used generators. let incomplete_gates = self From 6ba6963363168363b57373d18a92a42bc2b5a5ed Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 19:28:13 +0000 Subject: [PATCH 17/37] wip clean up --- .gitignore | 3 ++ field/Cargo.toml | 8 ++-- field/src/extension/quadratic.rs | 8 ---- field/src/extension/quartic.rs | 8 ---- field/src/extension/quintic.rs | 8 ---- field/src/fft.rs | 70 ++++++++------------------------ field/src/goldilocks_field.rs | 7 ---- field/src/polynomial/mod.rs | 2 +- field/src/secp256k1_base.rs | 7 ---- field/src/secp256k1_scalar.rs | 6 --- field/src/types.rs | 3 -- plonky2/Cargo.toml | 7 +++- plonky2/examples/fibonacci.rs | 10 ++--- 13 files changed, 35 insertions(+), 112 deletions(-) diff --git a/.gitignore b/.gitignore index 293a17bb6..695fe9b7b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ pgo-data.profdata # MacOS nuisances .DS_Store + +*.log + diff --git a/field/Cargo.toml b/field/Cargo.toml index 1b6a62d71..ba60626ff 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -36,6 +36,8 @@ workspace = true [features] -default = [] -# default = [ "cuda" ] -cuda = [] \ No newline at end of file +# default = [] +default = [ "cuda", "cuda_sanity_check" ] +cuda = [] +# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared +cuda_sanity_check = ["cuda"] \ No newline at end of file diff --git a/field/src/extension/quadratic.rs b/field/src/extension/quadratic.rs index ca74747b9..281369d21 100644 --- a/field/src/extension/quadratic.rs +++ b/field/src/extension/quadratic.rs @@ -59,14 +59,6 @@ impl> Sample for QuadraticExtension { } impl> Field for QuadraticExtension { - fn to_u64(&self) -> u64 { - unimplemented!() - } - - fn from_u64(u: u64) -> Self { - unimplemented!() - } - const ZERO: Self = Self([F::ZERO; 2]); const ONE: Self = Self([F::ONE, F::ZERO]); const TWO: Self = Self([F::TWO, F::ZERO]); diff --git a/field/src/extension/quartic.rs b/field/src/extension/quartic.rs index daa9d3aaf..8c8a9e7e4 100644 --- a/field/src/extension/quartic.rs +++ b/field/src/extension/quartic.rs @@ -65,14 +65,6 @@ impl> Sample for QuarticExtension { } impl> Field for QuarticExtension { - fn to_u64(&self) -> u64 { - unimplemented!() - } - - fn from_u64(u: u64) -> Self { - unimplemented!() - } - const ZERO: Self = Self([F::ZERO; 4]); const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO]); const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO]); diff --git a/field/src/extension/quintic.rs b/field/src/extension/quintic.rs index 21817c6c8..28ec92267 100644 --- a/field/src/extension/quintic.rs +++ b/field/src/extension/quintic.rs @@ -66,14 +66,6 @@ impl> Sample for QuinticExtension { } impl> Field for QuinticExtension { - fn to_u64(&self) -> u64 { - unimplemented!() - } - - fn from_u64(u: u64) -> Self { - unimplemented!() - } - const ZERO: Self = Self([F::ZERO; 5]); const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO, F::ZERO]); const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO, F::ZERO]); diff --git a/field/src/fft.rs b/field/src/fft.rs index 666c4d697..3ee9fb067 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -40,61 +40,32 @@ fn fft_dispatch_gpu( root_table: Option<&FftRootTable>, ) { if F::CUDA_SUPPORT { - use zeknox::device::memory::HostOrDeviceSlice; use zeknox::ntt_batch; use zeknox::types::NTTConfig; - // let mut input_clone = input.to_vec(); - // fft_dispatch_cpu(&mut input_clone, zero_factor, root_table); - // ark_std::println!("cpu done" ); + #[cfg(feature = "cuda_sanity_check")] + let cpu_res = { + let mut input_clone = input.to_vec(); + fft_dispatch_cpu(&mut input_clone, zero_factor, root_table); + input_clone + }; - let total_elements = input.len(); - let mut io_u64 = input.iter().map(|x| x.to_u64()).collect::>(); - - let mut device_data: HostOrDeviceSlice<'_, u64> = - HostOrDeviceSlice::cuda_malloc(0, total_elements).unwrap(); - device_data - .copy_from_host_offset(&io_u64, 0, total_elements) - .unwrap(); ntt_batch( 0, - device_data.as_mut_ptr() as *mut F, + input, input.len().trailing_zeros() as usize, NTTConfig::default(), ); - // Copy results back from device to host - io_u64.resize(total_elements, 0u64); - device_data - .copy_to_host(&mut io_u64, total_elements) - .unwrap(); - - // Convert u64 results back to field elements - input.iter_mut().zip(io_u64.iter()).for_each(|(a, b)| { - *a = F::from_canonical_u64(*b); - }); - // ark_std::println!("gpu done" ); - - // let mut to_print = false; - // for (i, (a, b)) in input.iter().zip(input_clone.iter()).enumerate() { - // if a != b { - // // panic!("Mismatch at index {}: gpu result = {}, cpu result = {}", i, a.to_u64(), b.to_u64()); - // to_print = true; - // ark_std::println!( - // "Mismatch at index {}: gpu result = {}, cpu result = {}", - // i, - // a.to_u64(), - // b.to_u64() - // ); - // } - // } - - // if to_print { - // ark_std::println!("Comparing results..."); - // ark_std::println!("cpu {:?}", input_clone); - // ark_std::println!("gpu {:?}", input); - // } - + #[cfg(feature = "cuda_sanity_check")] + for (i, (a, b)) in input.iter().zip(cpu_res.iter()).enumerate() { + if a != b { + panic!( + "Mismatch at index {}: gpu result = {}, cpu result = {}", + i, a, b + ); + } + } return; } else { return fft_dispatch_cpu(input, zero_factor, root_table); @@ -109,16 +80,7 @@ fn fft_dispatch_cpu( if root_table.is_some() { return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()); } else { - // let pre_computed = F::pre_compute_fft_root_table(input.len()); - // if pre_computed.is_some() { - // return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap()); - // } else { - // let computed = fft_root_table::(input.len()); - - // return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); - // } let computed = fft_root_table::(input.len()); - return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); }; } diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index 2ac66d4ea..ae8457744 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -68,13 +68,6 @@ impl Sample for GoldilocksField { } impl Field for GoldilocksField { - fn to_u64(&self) -> u64 { - self.0 - } - fn from_u64(u: u64) -> Self { - Self(u) - } - const ZERO: Self = Self(0); const ONE: Self = Self(1); const TWO: Self = Self(2); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index 21eae220c..5ce66943c 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -12,7 +12,7 @@ use plonky2_util::log2_strict; use serde::{Deserialize, Serialize}; use crate::extension::{Extendable, FieldExtension}; -use crate::fft::{FftRootTable, fft, fft_with_options, ifft, ifft_cpu}; +use crate::fft::{fft, fft_with_options, ifft, ifft_cpu, FftRootTable}; use crate::types::Field; /// A polynomial in point-value form. diff --git a/field/src/secp256k1_base.rs b/field/src/secp256k1_base.rs index 20bc7a395..6632a7f83 100644 --- a/field/src/secp256k1_base.rs +++ b/field/src/secp256k1_base.rs @@ -98,13 +98,6 @@ impl Field for Secp256K1Base { const BITS: usize = 256; - fn to_u64(&self) -> u64 { - unimplemented!() - } - fn from_u64(u: u64) -> Self { - unimplemented!() - } - fn order() -> BigUint { BigUint::from_slice(&[ 0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs index 3514188e9..2b299130d 100644 --- a/field/src/secp256k1_scalar.rs +++ b/field/src/secp256k1_scalar.rs @@ -79,12 +79,6 @@ impl Sample for Secp256K1Scalar { } impl Field for Secp256K1Scalar { - fn to_u64(&self) -> u64 { - unimplemented!() - } - fn from_u64(u: u64) -> Self { - unimplemented!() - } const ZERO: Self = Self([0; 4]); const ONE: Self = Self([1, 0, 0, 0]); diff --git a/field/src/types.rs b/field/src/types.rs index f365ea299..5a34bb6a3 100644 --- a/field/src/types.rs +++ b/field/src/types.rs @@ -94,9 +94,6 @@ pub trait Field: /// Whether this field is supported by cuda const CUDA_SUPPORT: bool = false; - fn to_u64(&self) -> u64; - fn from_u64(u: u64) -> Self; - fn order() -> BigUint; fn characteristic() -> BigUint; diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 2de7408ef..5121535a7 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -12,13 +12,16 @@ keywords.workspace = true categories.workspace = true [features] -default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] -# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing"] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] +default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] std = ["anyhow/std", "rand/std", "itertools/use_std"] timing = ["std", "dep:web-time"] cuda = ["plonky2_field/cuda"] +# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared +cuda_sanity_check = [ "cuda", "plonky2_field/cuda_sanity_check" ] + [dependencies] ahash = { workspace = true } diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index ead3fc974..cdd4faac8 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -29,14 +29,14 @@ fn main() -> Result<()> { let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..9999999 { + for _ in 0..999999 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; } println!("Circuit built."); - let size = 19; + let size = 16; #[cfg(feature = "cuda")] { @@ -54,19 +54,19 @@ fn main() -> Result<()> { // } zeknox::init_twiddle_factors_rs(0, size); - zeknox::init_twiddle_factors_rs(0, size+3); + zeknox::init_twiddle_factors_rs(0, size + 3); // Initialize coset on GPU // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) // TODO: Make this generic for other fields if needed let coset_gen_u64 = 7u64; - zeknox::init_coset_rs(0, size+3, coset_gen_u64); + zeknox::init_coset_rs(0, size + 3, coset_gen_u64); // warm up GPU // for some reason the first 10 FFTs are somewhat buggy for i in 0..10 { let t = (0..1 << size) - .map(|x| GoldilocksField::from_u64(i * x as u64)) + .map(|x| GoldilocksField::from_canonical_u64(i * x as u64)) .collect(); let poly = PolynomialCoeffs::new(t); let _ = plonky2_field::fft::fft(poly.clone()); From 7b659d75562c7f2a07c258164581a147e262ac77 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 19:43:03 +0000 Subject: [PATCH 18/37] continue clean up --- field/src/fft.rs | 10 -------- field/src/secp256k1_scalar.rs | 1 - plonky2/src/fri/oracle.rs | 34 +++++++++++++++++++++++++--- plonky2/src/plonk/circuit_builder.rs | 6 ++--- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/field/src/fft.rs b/field/src/fft.rs index 3ee9fb067..6c6874a62 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -32,7 +32,6 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } -#[allow(dead_code)] #[cfg(feature = "cuda")] fn fft_dispatch_gpu( input: &mut [F], @@ -297,14 +296,6 @@ mod tests { type F = GoldilocksField; let degree = 200usize; let degree_padded = degree.next_power_of_two(); - println!("Initializing CUDA"); - - #[cfg(feature = "cuda")] - for i in 8..=12 { - zeknox::init_twiddle_factors_rs(0, i); - } - - println!("Testing fft/ifft with degree {}", degree); // Create a vector of coeffs; the first degree of them are // "random", the last degree_padded-degree of them are zero. @@ -326,7 +317,6 @@ mod tests { assert_eq!(interpolated_coefficients.coeffs[i], F::ZERO); } - println!("Testing ldes"); for r in 0..4 { // expand coefficients by factor 2^r by filling with zeros let zero_tail = coefficients.lde(r); diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs index 2b299130d..3ca5b6ba2 100644 --- a/field/src/secp256k1_scalar.rs +++ b/field/src/secp256k1_scalar.rs @@ -79,7 +79,6 @@ impl Sample for Secp256K1Scalar { } impl Field for Secp256K1Scalar { - const ZERO: Self = Self([0; 4]); const ONE: Self = Self([1, 0, 0, 0]); const TWO: Self = Self([2, 0, 0, 0]); diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index ed5af3b67..11054cb51 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -53,6 +53,36 @@ impl, C: GenericConfig, const D: usize> D impl, C: GenericConfig, const D: usize> PolynomialBatch { + /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. + /// This function is called by the builder during preprocessing the circuit. + /// We use parallel IFFT on CPU here to avoid strange GPU issue. + pub fn preprocessor_from_values( + values: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + fft_root_table: Option<&FftRootTable>, + ) -> Self { + let coeffs = timed!( + timing, + "IFFT", + values + .into_par_iter() + .map(|v| v.ifft_cpu()) + .collect::>() + ); + + Self::from_coeffs( + coeffs, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } + /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. pub fn from_values( values: Vec>, @@ -62,12 +92,10 @@ impl, C: GenericConfig, const D: usize> timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, ) -> Self { - println!("using slow ifft_cpu"); let coeffs = timed!( timing, "IFFT", - // Use sequential iteration for deterministic results - values.into_iter().map(|v| v.ifft_cpu()).collect::>() + values.into_par_iter().map(|v| v.ifft()).collect::>() ); Self::from_coeffs( diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index 6ca5b712d..5b2f2c3a1 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -1225,10 +1225,11 @@ impl, const D: usize> CircuitBuilder { let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor))); let fft_root_table = fft_root_table(max_fft_points); - println!("start to commit"); + // This part of the code on GPU is buggy. So we use CPU for computation. + // It does not impact performance as this is only done once during setup. let constants_sigmas_commitment = if commit_to_sigma { let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat(); - PolynomialBatch::::from_values( + PolynomialBatch::::preprocessor_from_values( constants_sigmas_vecs, rate_bits, PlonkOracle::CONSTANTS_SIGMAS.blinding, @@ -1239,7 +1240,6 @@ impl, const D: usize> CircuitBuilder { } else { PolynomialBatch::::default() }; - println!("end commit"); // Map between gates where not all generators are used and the gate's number of used generators. let incomplete_gates = self From 83f3d96eb68b79b7c758a762c120a60ba663d981 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 20:15:13 +0000 Subject: [PATCH 19/37] merkle tree sanity checks --- field/src/polynomial/mod.rs | 3 +++ perm_comp.md | 36 ------------------------- plonky2/src/fri/oracle.rs | 13 --------- plonky2/src/hash/merkle_tree.rs | 48 ++++++++++++++++++++++++--------- 4 files changed, 39 insertions(+), 61 deletions(-) delete mode 100644 perm_comp.md diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index 5ce66943c..a38a7c553 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -55,10 +55,13 @@ impl PolynomialValues { self.values.len() } + /// Adaptive IFFT: uses GPU if available, otherwise CPU. pub fn ifft(self) -> PolynomialCoeffs { ifft(self) } + /// Enfored to use CPU IFFT. + /// Used for bypass the GPU issue during setup phase. pub fn ifft_cpu(self) -> PolynomialCoeffs { ifft_cpu(self) } diff --git a/perm_comp.md b/perm_comp.md deleted file mode 100644 index 333134394..000000000 --- a/perm_comp.md +++ /dev/null @@ -1,36 +0,0 @@ -# Performance comparison -- CPU: AMD 7950x3d; 16 core -- GPU: NVidia 4080; single card -- Circuit size: 2^19 gates -- Total CPU time: **32.97 s** -- Total GPU time: **19.71 s** - -| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? | -|-----------|---------|---------|---------|------------| -| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated | -| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated | -| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated | -| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes | -| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** | -| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** | -| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated | -| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned | -| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated | -| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes | -| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** | -| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** | -| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated | -| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned | -| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated | -| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated| -| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes | -| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** | -| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated | -| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned | -| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated | -| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated | -| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated | -| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated | -| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated | -| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated | -| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated | \ No newline at end of file diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 11054cb51..d008301cf 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -125,20 +125,7 @@ impl, C: GenericConfig, const D: usize> ); let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values)); - // Debug: Print first leaf for determinism check - if !leaves.is_empty() && !leaves[0].is_empty() { - println!( - "First leaf before reverse_bits: {:?}", - &leaves[0][..4.min(leaves[0].len())] - ); - } reverse_index_bits_in_place(&mut leaves); - if !leaves.is_empty() && !leaves[0].is_empty() { - println!( - "First leaf after reverse_bits: {:?}", - &leaves[0][..4.min(leaves[0].len())] - ); - } let merkle_tree = timed!( timing, "build Merkle tree", diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 3561471bd..102a8174c 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -484,9 +484,43 @@ impl> MerkleTree { let digests_buf = capacity_up_to_mut(&mut digests, num_digests); let cap_buf = capacity_up_to_mut(&mut cap, len_cap); - let now = Instant::now(); + + #[cfg(feature = "cuda_sanity_check")] + let (digests_buf_cpu, cap_cpu) = { + let mut digests_buf_cpu = digests_buf.to_vec(); + let mut cap_buf_cpu = cap_buf.to_vec(); + + fill_digests_buf::( + &mut digests_buf_cpu, + &mut cap_buf_cpu, + &leaves_1d.clone(), + leaf_size, + cap_height, + ); + + (digests_buf_cpu, cap_buf_cpu) + }; + fill_digests_buf_meta::(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height); - print_time(now, "fill digests buffer"); + + #[cfg(feature = "cuda_sanity_check")] + { + for i in 0..num_digests { + unsafe { + let hash1 = digests_buf[i].assume_init(); + let hash2 = digests_buf_cpu[i].assume_init(); + assert_eq!(hash1, hash2, "Digest mismatch at index {}", i); + } + } + for i in 0..len_cap { + unsafe { + let hash1 = cap_buf[i].assume_init(); + let hash2 = cap_cpu[i].assume_init(); + assert_eq!(hash1, hash2, "Cap mismatch at index {}", i); + } + } + } + unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to @@ -494,16 +528,6 @@ impl> MerkleTree { digests.set_len(num_digests); cap.set_len(len_cap); } - /* - println!{"Digest Buffer"}; - for dg in &digests { - println!("{:?}", dg); - } - println!{"Cap Buffer"}; - for dg in &cap { - println!("{:?}", dg); - } - */ Self { leaves: leaves_1d, leaf_size, From 2c9dd3fbd373518994450a8b148f3ad135e14075 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 20:54:12 +0000 Subject: [PATCH 20/37] fix more tests --- field/src/fft.rs | 11 +++++ field/src/polynomial/mod.rs | 21 +++++++++ fix_env.md | 79 +++++++++++++++++++++++++++++++++ plonky2/src/hash/merkle_tree.rs | 1 - 4 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 fix_env.md diff --git a/field/src/fft.rs b/field/src/fft.rs index 6c6874a62..6b0ea188c 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -293,6 +293,17 @@ mod tests { #[test] fn fft_and_ifft() { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for sizes we'll use + // degree_padded is 256 = 2^8 + // lde then add 4 more bits + for i in 8..=12 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + type F = GoldilocksField; let degree = 200usize; let degree_padded = degree.next_power_of_two(); diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index a38a7c553..04ee20cca 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -454,6 +454,17 @@ mod tests { use crate::goldilocks_field::GoldilocksField; use crate::types::Sample; + #[cfg(feature = "cuda")] + fn init_gpu_for_tests() { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for various sizes + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + #[test] fn test_trimmed() { type F = GoldilocksField; @@ -482,6 +493,9 @@ mod tests { #[test] fn test_coset_fft() { + #[cfg(feature = "cuda")] + init_gpu_for_tests(); + type F = GoldilocksField; let k = 8; @@ -503,6 +517,9 @@ mod tests { #[test] fn test_coset_ifft() { + #[cfg(feature = "cuda")] + init_gpu_for_tests(); + type F = GoldilocksField; let k = 8; @@ -609,6 +626,10 @@ mod tests { // `(X^n - 1)/(X - a) #[test] fn test_division_linear() { + + #[cfg(feature = "cuda")] + init_gpu_for_tests(); + type F = GoldilocksField; let mut rng = OsRng; let l = 14; diff --git a/fix_env.md b/fix_env.md new file mode 100644 index 000000000..d8304c817 --- /dev/null +++ b/fix_env.md @@ -0,0 +1,79 @@ +# NVIDIA Driver Fix After Reboot + +## Problem +After system reboot, `nvidia-smi` fails with error: +``` +NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. +``` + +## Root Cause +System updates install new kernel versions, but NVIDIA driver modules aren't automatically built for the new kernel because kernel headers are missing. + +## Quick Fix (Run after each kernel update) + +```bash +# 1. Install kernel headers for current kernel +sudo apt update +sudo apt install -y linux-headers-$(uname -r) + +# 2. DKMS will automatically rebuild NVIDIA modules +# Wait for the installation to complete (shows "Building module(s)..." and "done") + +# 3. Load the NVIDIA driver +sudo modprobe nvidia + +# 4. Verify it works +nvidia-smi +``` + +## Diagnostic Commands + +Check if NVIDIA modules are built for current kernel: +```bash +uname -r # Show current kernel version +dkms status # Check which kernels have NVIDIA modules +modinfo nvidia # Check if nvidia module exists for current kernel +lsmod | grep nvidia # Check if nvidia modules are loaded +``` + +## Prevention - Auto-install Headers (Recommended) + +Set up automatic kernel header installation: +```bash +sudo bash -c 'cat > /etc/apt/apt.conf.d/99auto-kernel-headers <> MerkleTree { } } - unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. From fee54ed00715a5859b5046f484621d660ded43af Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 21:00:25 +0000 Subject: [PATCH 21/37] clean up --- field/src/fft.rs | 8 ++++---- field/src/polynomial/mod.rs | 6 ++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/field/src/fft.rs b/field/src/fft.rs index 6b0ea188c..2509c7b2c 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -77,10 +77,10 @@ fn fft_dispatch_cpu( root_table: Option<&FftRootTable>, ) { if root_table.is_some() { - return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()); + fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()) } else { let computed = fft_root_table::(input.len()); - return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()) }; } @@ -92,11 +92,11 @@ fn fft_dispatch( ) { #[cfg(feature = "cuda")] { - return fft_dispatch_gpu(input, zero_factor, root_table); + fft_dispatch_gpu(input, zero_factor, root_table) } #[cfg(not(feature = "cuda"))] { - return fft_dispatch_cpu(input, zero_factor, root_table); + fft_dispatch_cpu(input, zero_factor, root_table) } } diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index 04ee20cca..ffe5d18c6 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -445,7 +445,6 @@ impl Mul for &PolynomialCoeffs { #[cfg(test)] mod tests { - use std::time::Instant; use rand::rngs::OsRng; use rand::Rng; @@ -625,10 +624,9 @@ mod tests { // Test to see which polynomial division method is faster for divisions of the type // `(X^n - 1)/(X - a) #[test] + #[cfg(not(feature = "cuda"))] fn test_division_linear() { - - #[cfg(feature = "cuda")] - init_gpu_for_tests(); + use std::time::Instant; type F = GoldilocksField; let mut rng = OsRng; From 4b82d19ac25bdaab8123532699ed8b5145806667 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 21:10:02 +0000 Subject: [PATCH 22/37] clean up printing --- field/Cargo.toml | 1 - perm_comp.md | 36 +++++++++++++++++++++++++++ plonky2/examples/fibonacci.rs | 26 ------------------- plonky2/src/gates/equality_base.rs | 1 - plonky2/src/hash/merkle_tree.rs | 5 ++++ plonky2/src/iop/generator.rs | 5 ---- plonky2/src/plonk/prover.rs | 19 +++----------- plonky2/src/util/serialization/mod.rs | 2 -- 8 files changed, 44 insertions(+), 51 deletions(-) create mode 100644 perm_comp.md diff --git a/field/Cargo.toml b/field/Cargo.toml index ba60626ff..80b4478da 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -11,7 +11,6 @@ keywords.workspace = true categories.workspace = true [dependencies] -ark-std = "0.5.0" anyhow = { workspace = true } itertools = { workspace = true, features = ["use_alloc"] } num = { workspace = true, features = ["alloc"] } diff --git a/perm_comp.md b/perm_comp.md new file mode 100644 index 000000000..333134394 --- /dev/null +++ b/perm_comp.md @@ -0,0 +1,36 @@ +# Performance comparison +- CPU: AMD 7950x3d; 16 core +- GPU: NVidia 4080; single card +- Circuit size: 2^19 gates +- Total CPU time: **32.97 s** +- Total GPU time: **19.71 s** + +| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? | +|-----------|---------|---------|---------|------------| +| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated | +| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated | +| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated | +| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes | +| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** | +| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** | +| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated | +| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned | +| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated | +| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes | +| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** | +| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** | +| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated | +| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned | +| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated | +| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated| +| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes | +| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** | +| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated | +| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned | +| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated | +| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated | +| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated | +| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated | +| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated | +| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated | +| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated | \ No newline at end of file diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index cdd4faac8..fade89256 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -1,11 +1,9 @@ use anyhow::{Ok, Result}; -use log::Level; use plonky2::field::types::Field; use plonky2::iop::witness::{PartialWitness, WitnessWrite}; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::circuit_data::CircuitConfig; use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; -use plonky2::util::timing::TimingTree; /// An example of using Plonky2 to prove a statement of the form /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b." @@ -40,39 +38,15 @@ fn main() -> Result<()> { #[cfg(feature = "cuda")] { - use plonky2_field::fft; - use plonky2_field::goldilocks_field::GoldilocksField; - use plonky2_field::polynomial::PolynomialCoeffs; - zeknox::clear_cuda_errors_rs(); println!("Initializing CUDA twiddle factors..."); - // Initialize twiddle factors for all dimensions that will be used - // This test involves multiple polynomials and recursive verification, - // so we initialize a wider range of dimensions to be safe - // for i in 0..=19 { - // zeknox::init_twiddle_factors_rs(0, i); - // } zeknox::init_twiddle_factors_rs(0, size); zeknox::init_twiddle_factors_rs(0, size + 3); // Initialize coset on GPU // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) - // TODO: Make this generic for other fields if needed let coset_gen_u64 = 7u64; zeknox::init_coset_rs(0, size + 3, coset_gen_u64); - - // warm up GPU - // for some reason the first 10 FFTs are somewhat buggy - - for i in 0..10 { - let t = (0..1 << size) - .map(|x| GoldilocksField::from_canonical_u64(i * x as u64)) - .collect(); - let poly = PolynomialCoeffs::new(t); - let _ = plonky2_field::fft::fft(poly.clone()); - } - println!("CUDA twiddle factors initialized."); - // zeknox::init_coset_rs(0, 16, coset_gen_u64); } // Public inputs are the two initial values (provided below) and the result (which is generated). diff --git a/plonky2/src/gates/equality_base.rs b/plonky2/src/gates/equality_base.rs index 50a315e81..86c302e18 100644 --- a/plonky2/src/gates/equality_base.rs +++ b/plonky2/src/gates/equality_base.rs @@ -160,7 +160,6 @@ impl, const D: usize> Gate for EqualityGate { ) }) .collect(); - //println!("generators {:?}", result.len()); result } diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index d98f41681..30693ca6a 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -35,10 +35,14 @@ use crate::util::log2_strict; #[cfg(feature = "cuda")] pub static GPU_ID: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); +#[cfg(feature = "cuda")] fn print_time(now: Instant, msg: &str) { println!("Time {} {} ms", msg, now.elapsed().as_millis()); } +#[cfg(not(feature = "cuda"))] +fn print_time(_now: Instant, _msg: &str) {} + #[cfg(feature = "cuda")] const FORCE_SINGLE_GPU: bool = true; @@ -521,6 +525,7 @@ impl> MerkleTree { } } + unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs index 8e387c4cd..d24d8d42b 100644 --- a/plonky2/src/iop/generator.rs +++ b/plonky2/src/iop/generator.rs @@ -36,7 +36,6 @@ pub fn generate_partial_witness< let config = &common_data.config; let generators = &prover_data.generators; let generator_indices_by_watches = &prover_data.generator_indices_by_watches; - println!("Initializing witness."); let mut witness = PartitionWitness::new( config.num_wires, common_data.degree(), @@ -57,8 +56,6 @@ pub fn generate_partial_witness< let mut buffer = GeneratedValues::empty(); - println!("Starting generator execution."); - // Keep running generators until we fail to make progress. while !pending_generator_indices.is_empty() { let mut next_pending_generator_indices = Vec::new(); @@ -98,8 +95,6 @@ pub fn generate_partial_witness< pending_generator_indices = next_pending_generator_indices; } - println!("Finished generator execution."); - if remaining_generators != 0 { return Err(anyhow!("{} generators weren't run", remaining_generators)); } diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index 649c811a1..64771ae3a 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -150,7 +150,6 @@ where let degree = common_data.degree(); set_lookup_wires(prover_data, common_data, &mut partition_witness)?; - println!("Set lookup wires."); let public_inputs = partition_witness.get_targets(&prover_data.public_inputs); let public_inputs_hash = C::InnerHasher::hash_no_pad(&public_inputs); @@ -160,7 +159,6 @@ where "compute full witness", partition_witness.full_witness() ); - println!("Computed full witness."); let wires_values: Vec> = timed!( timing, "compute wire polynomials", @@ -171,7 +169,6 @@ where .map(|column| PolynomialValues::new(column.clone())) .collect() ); - println!("Computed wire polynomials."); // Debug: Print first few wire values to check determinism if !wires_values.is_empty() && !wires_values[0].values.is_empty() { println!( @@ -191,7 +188,6 @@ where prover_data.fft_root_table.as_ref(), ) ); - println!("Computed wires commitment."); let mut challenger = Challenger::::new(); // Observe the FRI config @@ -239,7 +235,6 @@ where .collect(); let zs_partial_products = [plonk_z_vecs, partial_products_and_zs.concat()].concat(); - println!("Computed Z and partial products."); // All lookup polys: RE and partial SLDCs. let lookup_polys = compute_all_lookup_polys(&witness, &deltas, prover_data, common_data, has_lookup); @@ -249,8 +244,7 @@ where } else { zs_partial_products }; - - println!("Computed lookup polynomials."); + let partial_products_zs_and_lookup_commitment = timed!( timing, "commit to partial products, Z's and, if any, lookup polynomials", @@ -283,12 +277,7 @@ where &alphas, ) ); - println!("prover alphas: {:?}", alphas); - println!("prover betas: {:?}", betas); - println!("prover gammas: {:?}", gammas); - println!("prover deltas: {:?}", deltas); - println!("Split up quotient polys."); let all_quotient_poly_chunks: Vec> = timed!( timing, "split up quotient polys", @@ -304,7 +293,6 @@ where .collect() ); - println!("Committed to quotient polys."); let quotient_polys_commitment = timed!( timing, "commit to quotient polys", @@ -317,12 +305,11 @@ where prover_data.fft_root_table.as_ref(), ) ); - - println!("Committed to quotient polys."); + challenger.observe_cap::("ient_polys_commitment.merkle_tree.cap); let zeta = challenger.get_extension_challenge::(); - println!("prover zeta: {:?}", zeta); + // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`. diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs index fbfd7974f..209b43d3c 100644 --- a/plonky2/src/util/serialization/mod.rs +++ b/plonky2/src/util/serialization/mod.rs @@ -324,7 +324,6 @@ pub trait Read { let leaf_len = self.read_usize()?; let mut leaves_2d = Vec::with_capacity(leaves_len * leaf_len); for _ in 0..leaves_len { - // let leaf_len = self.read_usize()?; leaves_2d.push(self.read_field_vec(leaf_len)?); } @@ -1426,7 +1425,6 @@ pub trait Write { self.write_usize(leaves_count)?; self.write_usize(tree.leaf_size)?; for i in 0..leaves_count { - // self.write_usize(tree.leaf_size)?; self.write_field_vec(&tree.get(i))?; } self.write_hash_vec::(&tree.digests)?; From 8761671a74dc4825f1b32ef2e8c4bea0235ba463 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 21:25:40 +0000 Subject: [PATCH 23/37] clean up and scripts for testing --- field/Cargo.toml | 4 +- fix_env.md | 79 ----------------- perm_comp.md | 36 -------- plonky2/Cargo.toml | 4 +- plonky2/examples/fibonacci.rs | 4 +- plonky2/src/hash/merkle_proofs.rs | 1 + plonky2/src/hash/merkle_tree.rs | 2 +- plonky2/src/plonk/prover.rs | 4 +- plonky2/src/util/mod.rs | 1 - test_gpu.sh | 141 ++++++++++++++++++++++++++++++ 10 files changed, 151 insertions(+), 125 deletions(-) delete mode 100644 fix_env.md delete mode 100644 perm_comp.md create mode 100755 test_gpu.sh diff --git a/field/Cargo.toml b/field/Cargo.toml index 80b4478da..02f535922 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -35,8 +35,8 @@ workspace = true [features] -# default = [] -default = [ "cuda", "cuda_sanity_check" ] +default = [] +# default = [ "cuda", "cuda_sanity_check" ] cuda = [] # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared cuda_sanity_check = ["cuda"] \ No newline at end of file diff --git a/fix_env.md b/fix_env.md deleted file mode 100644 index d8304c817..000000000 --- a/fix_env.md +++ /dev/null @@ -1,79 +0,0 @@ -# NVIDIA Driver Fix After Reboot - -## Problem -After system reboot, `nvidia-smi` fails with error: -``` -NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. -``` - -## Root Cause -System updates install new kernel versions, but NVIDIA driver modules aren't automatically built for the new kernel because kernel headers are missing. - -## Quick Fix (Run after each kernel update) - -```bash -# 1. Install kernel headers for current kernel -sudo apt update -sudo apt install -y linux-headers-$(uname -r) - -# 2. DKMS will automatically rebuild NVIDIA modules -# Wait for the installation to complete (shows "Building module(s)..." and "done") - -# 3. Load the NVIDIA driver -sudo modprobe nvidia - -# 4. Verify it works -nvidia-smi -``` - -## Diagnostic Commands - -Check if NVIDIA modules are built for current kernel: -```bash -uname -r # Show current kernel version -dkms status # Check which kernels have NVIDIA modules -modinfo nvidia # Check if nvidia module exists for current kernel -lsmod | grep nvidia # Check if nvidia modules are loaded -``` - -## Prevention - Auto-install Headers (Recommended) - -Set up automatic kernel header installation: -```bash -sudo bash -c 'cat > /etc/apt/apt.conf.d/99auto-kernel-headers < Result<()> { } println!("Circuit built."); - let size = 16; - #[cfg(feature = "cuda")] { + let size = 16; + zeknox::clear_cuda_errors_rs(); println!("Initializing CUDA twiddle factors..."); diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs index 892564932..4fb3393d3 100644 --- a/plonky2/src/hash/merkle_proofs.rs +++ b/plonky2/src/hash/merkle_proofs.rs @@ -240,6 +240,7 @@ impl, const D: usize> CircuitBuilder { /// Same as `verify_batch_merkle_proof_to_cap`, except with the final "cap index" as separate parameter, /// rather than being contained in `leaf_index_bits`. + #[allow(dead_code)] pub(crate) fn verify_batch_merkle_proof_to_cap_with_cap_index>( &mut self, leaf_data: &[Vec], diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 30693ca6a..705f5ad74 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -41,6 +41,7 @@ fn print_time(now: Instant, msg: &str) { } #[cfg(not(feature = "cuda"))] +#[allow(dead_code)] fn print_time(_now: Instant, _msg: &str) {} #[cfg(feature = "cuda")] @@ -525,7 +526,6 @@ impl> MerkleTree { } } - unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index 64771ae3a..d05619311 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -244,7 +244,7 @@ where } else { zs_partial_products }; - + let partial_products_zs_and_lookup_commitment = timed!( timing, "commit to partial products, Z's and, if any, lookup polynomials", @@ -305,7 +305,7 @@ where prover_data.fft_root_table.as_ref(), ) ); - + challenger.observe_cap::("ient_polys_commitment.merkle_tree.cap); let zeta = challenger.get_extension_challenge::(); diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index 6f2ae608e..cb11f05e2 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -3,7 +3,6 @@ #[cfg(not(feature = "std"))] use alloc::vec::Vec; -use plonky2_maybe_rayon::*; #[doc(inline)] pub use plonky2_util::*; #[cfg(feature = "cuda")] diff --git a/test_gpu.sh b/test_gpu.sh new file mode 100755 index 000000000..a9c2c4a8c --- /dev/null +++ b/test_gpu.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +# GPU Testing Script for Plonky2 +# This script validates CUDA setup, zeknox library, and runs GPU-accelerated tests + +set -e # Exit on any error + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo "=========================================" +echo "Plonky2 GPU Testing Script" +echo "=========================================" +echo "" + +# Step 1: Check NVIDIA driver and CUDA +echo -e "${YELLOW}[1/5] Checking NVIDIA driver and CUDA...${NC}" +if ! command -v nvidia-smi &> /dev/null; then + echo -e "${RED}ERROR: nvidia-smi not found. Please install NVIDIA drivers.${NC}" + exit 1 +fi + +echo "NVIDIA Driver Information:" +nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv +echo "" + +# Check for CUDA toolkit +if command -v nvcc &> /dev/null; then + echo "CUDA Compiler Version:" + nvcc --version | grep "release" + echo "" +elif [ -n "$CUDA_HOME" ]; then + echo "CUDA_HOME is set to: $CUDA_HOME" + echo "" +else + echo -e "${YELLOW}WARNING: nvcc not found and CUDA_HOME not set. CUDA toolkit may not be installed.${NC}" + echo "Continuing anyway as runtime libraries may still be available..." + echo "" +fi + +echo -e "${GREEN} NVIDIA driver check passed${NC}" +echo "" + +# Step 2: Check zeknox library +echo -e "${YELLOW}[2/5] Checking zeknox library...${NC}" +ZEKNOX_PATH="../zeknox" +if [ ! -d "$ZEKNOX_PATH" ]; then + echo -e "${RED}ERROR: zeknox library not found at $ZEKNOX_PATH${NC}" + echo "Expected location: $(cd .. && pwd)/zeknox" + exit 1 +fi + +if [ -d "$ZEKNOX_PATH/wrappers/rust" ]; then + echo "Found zeknox library at: $(cd $ZEKNOX_PATH && pwd)" + echo "Rust wrappers directory exists: $ZEKNOX_PATH/wrappers/rust" +else + echo -e "${YELLOW}WARNING: zeknox/wrappers/rust not found, but zeknox directory exists${NC}" +fi + +echo -e "${GREEN} zeknox library check passed${NC}" +echo "" + +# Step 3: Run field tests +echo -e "${YELLOW}[3/5] Running field tests with GPU acceleration...${NC}" +echo "Command: cd field && cargo test --release --features=cuda -- --test-threads=1" +echo "" + +cd field +if cargo test --release --features=cuda -- --test-threads=1; then + echo "" + echo -e "${GREEN} Field tests passed${NC}" +else + echo "" + echo -e "${RED}ERROR: Field tests failed${NC}" + cd .. + exit 1 +fi +cd .. +echo "" + +# Step 4: Run fibonacci example with CUDA for correctness +echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}" +echo "Command: NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci" +echo "" + +if NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci; then + echo "" + echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}" +else + echo "" + echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}" + exit 1 +fi +echo "" + +# Step 5: Run fibonacci example with CUDA for speed +echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}" +echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda" +echo "" + +if NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda; then + echo "" + echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}" +else + echo "" + echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}" + exit 1 +fi +echo "" + + +# Step 6: Run fibonacci example with CPU +echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}" +echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci" +echo "" + +if cargo run --release --example fibonacci; then + echo "" + echo -e "${GREEN} Fibonacci example completed successfully with CPU${NC}" +else + echo "" + echo -e "${RED}ERROR: Fibonacci example failed with CPU${NC}" + exit 1 +fi +echo "" + +# Step 7: Summary +echo "=========================================" +echo -e "${GREEN}All GPU tests completed successfully!${NC}" +echo "=========================================" +echo "" +echo "Tests run:" +echo "  NVIDIA driver and CUDA verification" +echo "  zeknox library verification" +echo "  Field tests (FFT, polynomials, interpolation, cosets)" +echo "  Fibonacci proof generation with GPU acceleration" +echo "" +echo -e "${GREEN}GPU testing complete!${NC}" From d58854cb91d6226882f36515533afe52da450fbe Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 21:30:23 +0000 Subject: [PATCH 24/37] more clean up --- plonky2/src/plonk/verifier.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/plonky2/src/plonk/verifier.rs b/plonky2/src/plonk/verifier.rs index d369656c6..fa1bc14b8 100644 --- a/plonky2/src/plonk/verifier.rs +++ b/plonky2/src/plonk/verifier.rs @@ -27,11 +27,6 @@ pub(crate) fn verify, C: GenericConfig, c &verifier_data.circuit_digest, common_data, )?; - println!("verifier alphas: {:?}", challenges.plonk_alphas); - println!("verifier betas: {:?}", challenges.plonk_betas); - println!("verifier gammas: {:?}", challenges.plonk_gammas); - println!("verifier deltas: {:?}", challenges.plonk_deltas); - println!("verifier zeta: {:?}", challenges.plonk_zeta); verify_with_challenges::( proof_with_pis.proof, From 50a71fb6a087eb5f325220f95d6fb3c18b33df54 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 11 Dec 2025 21:36:37 +0000 Subject: [PATCH 25/37] more clean up --- plonky2/src/hash/merkle_tree.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 705f5ad74..41d0ffee7 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -36,8 +36,8 @@ use crate::util::log2_strict; pub static GPU_ID: Lazy>> = Lazy::new(|| Arc::new(Mutex::new(0))); #[cfg(feature = "cuda")] -fn print_time(now: Instant, msg: &str) { - println!("Time {} {} ms", msg, now.elapsed().as_millis()); +fn print_time(_now: Instant, _msg: &str) { + // println!("Time {} {} ms", _msg, _now.elapsed().as_millis()); } #[cfg(not(feature = "cuda"))] @@ -526,6 +526,7 @@ impl> MerkleTree { } } + unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. From 9246859ffe2eec7c0fdbefa536c69e8951601e47 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 12 Dec 2025 20:25:59 +0000 Subject: [PATCH 26/37] fix bugs --- field/Cargo.toml | 3 ++- plonky2/Cargo.toml | 9 ++++++-- plonky2/src/fri/oracle.rs | 31 +++------------------------- plonky2/src/hash/merkle_tree.rs | 1 - plonky2/src/plonk/circuit_builder.rs | 2 +- test_gpu.sh | 12 +++++------ 6 files changed, 19 insertions(+), 39 deletions(-) diff --git a/field/Cargo.toml b/field/Cargo.toml index 02f535922..39ee8ef07 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -35,7 +35,8 @@ workspace = true [features] -default = [] +# default = [] +default = [ "cuda" ] # default = [ "cuda", "cuda_sanity_check" ] cuda = [] # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 7f91186a0..df60129ac 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -11,8 +11,13 @@ repository.workspace = true keywords.workspace = true categories.workspace = true + [features] -default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"] +# default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ] + +default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ] +# default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] # default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] gate_testing = [] parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"] @@ -31,7 +36,7 @@ itertools = { workspace = true } keccak-hash = { version = "0.8.0", default-features = false } log = { workspace = true } num = { workspace = true } -once_cell = { workspace = true } +once_cell = { workspace = true, features = ["std"] } rand = { workspace = true } rand_chacha = { version = "0.3.1", optional = true, default-features = false } serde = { workspace = true, features = ["rc"] } diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index d008301cf..e058374e2 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -55,8 +55,8 @@ impl, C: GenericConfig, const D: usize> { /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. /// This function is called by the builder during preprocessing the circuit. - /// We use parallel IFFT on CPU here to avoid strange GPU issue. - pub fn preprocessor_from_values( + /// This function always calls IFFT on CPU to avoid strange GPU issue. + pub fn from_values( values: Vec>, rate_bits: usize, blinding: bool, @@ -66,7 +66,7 @@ impl, C: GenericConfig, const D: usize> ) -> Self { let coeffs = timed!( timing, - "IFFT", + "CPU IFFT", values .into_par_iter() .map(|v| v.ifft_cpu()) @@ -83,31 +83,6 @@ impl, C: GenericConfig, const D: usize> ) } - /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. - pub fn from_values( - values: Vec>, - rate_bits: usize, - blinding: bool, - cap_height: usize, - timing: &mut TimingTree, - fft_root_table: Option<&FftRootTable>, - ) -> Self { - let coeffs = timed!( - timing, - "IFFT", - values.into_par_iter().map(|v| v.ifft()).collect::>() - ); - - Self::from_coeffs( - coeffs, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - ) - } - /// Creates a list polynomial commitment for the polynomials `polynomials`. pub fn from_coeffs( polynomials: Vec>, diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 41d0ffee7..b2a57df52 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -526,7 +526,6 @@ impl> MerkleTree { } } - unsafe { // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to // `num_digests` and `len_cap`, resp. diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index 5b2f2c3a1..e6a81f378 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -1229,7 +1229,7 @@ impl, const D: usize> CircuitBuilder { // It does not impact performance as this is only done once during setup. let constants_sigmas_commitment = if commit_to_sigma { let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat(); - PolynomialBatch::::preprocessor_from_values( + PolynomialBatch::::from_values( constants_sigmas_vecs, rate_bits, PlonkOracle::CONSTANTS_SIGMAS.blinding, diff --git a/test_gpu.sh b/test_gpu.sh index a9c2c4a8c..9979bfeaf 100755 --- a/test_gpu.sh +++ b/test_gpu.sh @@ -17,7 +17,7 @@ echo "=========================================" echo "" # Step 1: Check NVIDIA driver and CUDA -echo -e "${YELLOW}[1/5] Checking NVIDIA driver and CUDA...${NC}" +echo -e "${YELLOW}[1/7] Checking NVIDIA driver and CUDA...${NC}" if ! command -v nvidia-smi &> /dev/null; then echo -e "${RED}ERROR: nvidia-smi not found. Please install NVIDIA drivers.${NC}" exit 1 @@ -45,7 +45,7 @@ echo -e "${GREEN} NVIDIA driver check passed${NC}" echo "" # Step 2: Check zeknox library -echo -e "${YELLOW}[2/5] Checking zeknox library...${NC}" +echo -e "${YELLOW}[2/7] Checking zeknox library...${NC}" ZEKNOX_PATH="../zeknox" if [ ! -d "$ZEKNOX_PATH" ]; then echo -e "${RED}ERROR: zeknox library not found at $ZEKNOX_PATH${NC}" @@ -64,7 +64,7 @@ echo -e "${GREEN} zeknox library check passed${NC}" echo "" # Step 3: Run field tests -echo -e "${YELLOW}[3/5] Running field tests with GPU acceleration...${NC}" +echo -e "${YELLOW}[3/7] Running field tests with GPU acceleration...${NC}" echo "Command: cd field && cargo test --release --features=cuda -- --test-threads=1" echo "" @@ -82,7 +82,7 @@ cd .. echo "" # Step 4: Run fibonacci example with CUDA for correctness -echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}" +echo -e "${YELLOW}[4/7] Running fibonacci example with CUDA features...${NC}" echo "Command: NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci" echo "" @@ -97,7 +97,7 @@ fi echo "" # Step 5: Run fibonacci example with CUDA for speed -echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}" +echo -e "${YELLOW}[5/7] Running fibonacci example with CUDA features...${NC}" echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda" echo "" @@ -113,7 +113,7 @@ echo "" # Step 6: Run fibonacci example with CPU -echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}" +echo -e "${YELLOW}[6/7] Running fibonacci example with CUDA features...${NC}" echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci" echo "" From 39ca68006a2ef6da6b3b72bb6f29ff8a7317cfad Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 15 Dec 2025 21:01:10 +0000 Subject: [PATCH 27/37] working now --- field/src/fft.rs | 34 +++ field/src/polynomial/division.rs | 2 +- plonky2/examples/fibonacci.rs | 4 +- plonky2/src/fri/oracle.rs | 332 +++++++++++++++++++++++++++- plonky2/src/plonk/circuit_data.rs | 12 +- plonky2/src/plonk/prover.rs | 100 ++++++--- plonky2/src/plonk/vanishing_poly.rs | 95 +++++--- 7 files changed, 505 insertions(+), 74 deletions(-) diff --git a/field/src/fft.rs b/field/src/fft.rs index 2509c7b2c..e08fd8021 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -11,6 +11,40 @@ use crate::types::Field; pub type FftRootTable = Vec>; +pub fn batch_fft(input: &[PolynomialCoeffs]) -> Vec> { + #[cfg(feature = "cuda")] + { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + let mut data = input + .iter() + .flat_map(|poly| poly.coeffs.clone()) + .collect::>(); + let mut cfg = NTTConfig::default(); + cfg.batches = input.len() as u32; + let poly_len = input[0].len(); + ntt_batch(0, &mut data, log2_strict(poly_len), cfg); + + data.chunks(poly_len) + .map(|chunk| PolynomialValues::new(chunk.to_vec())) + .collect() + } + #[cfg(not(feature = "cuda"))] + { + let mut res = Vec::with_capacity(input.len()); + for poly in input.iter() { + let mut batch_res = Vec::with_capacity(poly.len()); + for p in poly { + let pv = fft_with_options(p.clone(), None, None); + batch_res.push(pv); + } + res.extend(batch_res); + } + res + } +} + pub fn fft_root_table(n: usize) -> FftRootTable { let lg_n = log2_strict(n); // bases[i] = g^2^i, for i = 0, ..., lg_n - 1 diff --git a/field/src/polynomial/division.rs b/field/src/polynomial/division.rs index 7d85d5492..a34602de4 100644 --- a/field/src/polynomial/division.rs +++ b/field/src/polynomial/division.rs @@ -78,7 +78,7 @@ impl PolynomialCoeffs { .iter() .rev() .scan(F::ZERO, |acc, &c| { - *acc = *acc * z + c; + *acc = c.multiply_accumulate(*acc, z); Some(*acc) }) .collect::>(); diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index 71bd479bb..d2d30e2d7 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -27,7 +27,7 @@ fn main() -> Result<()> { let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..999999 { + for _ in 0..99 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; @@ -36,7 +36,7 @@ fn main() -> Result<()> { #[cfg(feature = "cuda")] { - let size = 16; + let size = 3; zeknox::clear_cuda_errors_rs(); println!("Initializing CUDA twiddle factors..."); diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index e058374e2..29ebb5fd1 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -53,6 +53,24 @@ impl, C: GenericConfig, const D: usize> D impl, C: GenericConfig, const D: usize> PolynomialBatch { + // pub fn from_values_gpu( + // values: Vec>, + // rate_bits: usize, + // blinding: bool, + // cap_height: usize, + // timing: &mut TimingTree, + // fft_root_table: Option<&FftRootTable>, + // ) -> Self { + // let coeffs = timed!( + // timing, + // "CPU IFFT", + // values + // .into_par_iter() + // .map(|v| v.ifft_cpu()) + // .collect::>() + // ); + // } + /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. /// This function is called by the builder during preprocessing the circuit. /// This function always calls IFFT on CPU to avoid strange GPU issue. @@ -73,7 +91,7 @@ impl, C: GenericConfig, const D: usize> .collect::>() ); - Self::from_coeffs( + Self::from_coeffs_gpu( coeffs, rate_bits, blinding, @@ -116,6 +134,318 @@ impl, C: GenericConfig, const D: usize> } } + pub fn from_coeffs_gpu( + polynomials: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + fft_root_table: Option<&FftRootTable>, + ) -> Self { + let degree = polynomials[0].len(); + + // If blinding, salt with two random elements to each leaf vector. + let salt_size = if blinding { SALT_SIZE } else { 0 }; + println!( + "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}", + polynomials.len(), + degree, + blinding, + salt_size + ); + + #[cfg(feature = "cuda")] + { + if F::CUDA_SUPPORT { + return Self::from_coeffs_gpu_optimized( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + degree, + salt_size, + ); + } + } + + // Fallback to CPU path + let lde_values = polynomials + .iter() + .map(|p| { + assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); + p.lde(rate_bits) + .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table) + .values + }) + .chain( + (0..salt_size) + .into_iter() + .map(|_| F::rand_vec(degree << rate_bits)), + ) + .collect::>(); + let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values)); + reverse_index_bits_in_place(&mut leaves); + let merkle_tree = timed!( + timing, + "build Merkle tree", + MerkleTree::new_from_2d(leaves, cap_height) + ); + + Self { + polynomials, + merkle_tree, + degree_log: log2_strict(degree), + rate_bits, + blinding, + } + } + + #[cfg(feature = "cuda")] + fn from_coeffs_gpu_optimized( + polynomials: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + fft_root_table: Option<&FftRootTable>, + degree: usize, + salt_size: usize, + ) -> Self { + println!("Using GPU-accelerated LDE computation"); + + use std::time::Instant; + + use zeknox::device::memory::HostOrDeviceSlice; + use zeknox::types::{NTTConfig, TransposeConfig}; + use zeknox::{ntt_batch_ptr, transpose_rev_batch}; + + let lde_size = degree << rate_bits; + let num_polys = polynomials.len() + salt_size; + + // let lde_cpu = { + // // Fallback to CPU path + // let lde_values = polynomials + // .iter() + // .map(|p| { + // assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); + // p.lde(rate_bits) + // .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table) + // .values + // }) + // .collect::>(); + + // // for v in &lde_values { + // // println!("lde_values {:?}", v); + // // } + + // lde_values + // }; + + // let salt_size = if blinding { SALT_SIZE } else { 0 }; + + // Step 1: Compute coset FFT on GPU, keeping data on GPU + let gpu_lde_values = timed!(timing, "GPU coset FFT", { + // let mut all_lde_data = Vec::with_capacity(num_polys); + + // // Process each polynomial + // for p in polynomials.iter() { + // assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); + + // // Perform LDE (padding) and coset scaling on CPU + // let mut padded_coeffs = p.lde(rate_bits).coeffs; + + // // Apply coset shift + // let shift = F::coset_shift(); + // for (i, coeff) in padded_coeffs.iter_mut().enumerate() { + // *coeff *= shift.exp_u64(i as u64); + // } + + // all_lde_data.push(padded_coeffs); + // } + + // // Add salt + // for _ in 0..salt_size { + // all_lde_data.push(F::rand_vec(lde_size)); + // } + + // Allocate GPU memory for all polynomials + let total_elements = num_polys * lde_size; + println!( + "Allocating GPU memory for {} polynomials of size {} (total {} elements)", + num_polys, lde_size, total_elements + ); + let total_alloce_size = num_polys * lde_size; + // let total_alloce_size = num_polys.next_power_of_two() * lde_size; + + let timer = Instant::now(); + let mut gpu_buffer = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size) + .expect("Failed to allocate GPU memory"); + println!("cuda alloc took: {:?}", timer.elapsed()); + println!( + "lde size: {}, total_elements: {}, total allocated size: {}", + lde_size, total_elements, total_alloce_size + ); + + let timer = Instant::now(); + // Copy all data to GPU in one go + let mut flat_data = vec![F::ZERO; total_alloce_size]; + + for i in 0..polynomials.len() { + + flat_data[i*lde_size.. i*lde_size +degree].copy_from_slice(polynomials[i].coeffs.as_ref()) + + } + + // polynomials.par_iter().zip(flat_data.par_chunks_exact_mut(lde_size)).for_each(|(p, c)| + // c[..degree].copy_from_slice(p.coeffs.as_ref()) + // ); + + // let flat_data: Vec = polynomials + // .iter() + // .flat_map(|v| v.lde(rate_bits).coeffs) // pad each polynomial to lde_size + // // .into_iter() + // // .collect() + // // .iter() + // // .chain( + // // vec![F::ZERO; total_alloce_size - total_elements] + // // .iter() + // // .copied(), + // // ) + // .collect(); + + println!("cpu prepare took: {:?}", timer.elapsed()); + // let flat_data = vec![F::ZERO; lde_size - total_elements].iter() + + println!( + "Copying {} elements to GPU (expected {})", + flat_data.len(), + total_elements + ); + + let timer = Instant::now(); + gpu_buffer + .copy_from_host(&flat_data) + .expect("Failed to copy data to GPU"); + println!("IO took: {:?}", timer.elapsed()); + + + // Perform batched NTT on GPU + let log_domain_size = log2_strict(lde_size); + let ntt_config = NTTConfig { + batches: num_polys as u32, + are_inputs_on_device: true, + are_outputs_on_device: true, + with_coset: true, + ..Default::default() + }; + + let timer = Instant::now(); + ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config); + + println!("comput took: {:?}", timer.elapsed()); + + gpu_buffer + }); + + println!("Completed GPU coset FFT for {} polynomials", num_polys); + + // let total_elements = num_polys * lde_size; + // let mut gpu_lde_values_copied_to_cpu = vec![F::ZERO; total_elements]; + + // gpu_lde_values + // .copy_to_host(&mut gpu_lde_values_copied_to_cpu, total_elements) + // .expect("Failed to copy data from GPU"); + + // println!("lde value: {:?}", gpu_lde_values_copied_to_cpu); + + // let lde_cpu_1d = lde_cpu.clone().into_iter().flatten().collect::>(); + // assert_eq!(lde_cpu_1d.len(), gpu_lde_values_copied_to_cpu.len(), "LDE size mismatch"); + // assert_eq!(lde_cpu_1d, gpu_lde_values_copied_to_cpu, "LDE values mismatch"); + + // Step 2: Transpose on GPU using Zeknox + let gpu_transposed = timed!(timing, "GPU transpose", { + let total_alloce_size = num_polys * lde_size; + // let total_alloce_size = num_polys.next_power_of_two() * lde_size; + + let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size) + .expect("Failed to allocate GPU memory for transpose"); + + let log_n = log2_strict(lde_size); + let transpose_config = TransposeConfig { + batches: num_polys as u32, + are_inputs_on_device: true, + are_outputs_on_device: true, + }; + + transpose_rev_batch( + 0, + gpu_output.as_mut_ptr(), + gpu_lde_values.as_ptr(), + log_n, + transpose_config, + ); + + // gpu_lde_values will be automatically freed when it goes out of scope + + gpu_output + }); + print!("Completed GPU transpose for {} polynomials", num_polys); + + // Step 3: Copy back to CPU + let leaves = timed!(timing, "GPU to CPU transfer", { + let total_elements = num_polys * lde_size; + let mut cpu_data = vec![F::ZERO; total_elements]; + + gpu_transposed + .copy_to_host(&mut cpu_data, total_elements) + .expect("Failed to copy data from GPU"); + + // // Reshape into leaves: Vec> where each inner vec has num_polys elements + // cpu_data + // .chunks(num_polys) + // .map(|chunk| chunk.to_vec()) + // .collect::>() + + cpu_data + }); + + // let mut leaves_cpu = timed!(timing, "transpose LDEs", transpose(&lde_cpu)); + // println!("tatal leaves: {}", leaves.len()); + // println!("leaves[0]: {:?}", leaves[0]); + // println!("leaves_cpu[0]: {:?}", leaves_cpu[0]); + + // reverse_index_bits_in_place(&mut leaves_cpu); + // for i in 0..leaves.len() { + // if leaves[i] != leaves_cpu[i] { + // println!("Mismatch at leaf {}: \n{:?}\n{:?}\n", i, leaves[i], leaves_cpu[i]); + // } + // } + + // assert!(leaves == leaves_cpu, "Transposed LDE values mismatch"); + + // let merkle_tree = timed!( + // timing, + // "build Merkle tree", + // MerkleTree::new_from_2d(leaves, cap_height) + // ); + + let merkle_tree = timed!( + timing, + "build Merkle tree", + MerkleTree::new_from_1d(leaves,polynomials.len(), cap_height) + ); + + Self { + polynomials, + merkle_tree, + degree_log: log2_strict(degree), + rate_bits, + blinding, + } + } + pub(crate) fn lde_values( polynomials: &[PolynomialCoeffs], rate_bits: usize, diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs index e06e9b69b..61aa36ab0 100644 --- a/plonky2/src/plonk/circuit_data.rs +++ b/plonky2/src/plonk/circuit_data.rs @@ -19,6 +19,7 @@ use core::ops::{Range, RangeFrom}; use std::collections::BTreeMap; use anyhow::Result; +use log::Level; use serde::Serialize; use super::circuit_builder::LookupWire; @@ -213,12 +214,11 @@ impl, C: GenericConfig, const D: usize> } pub fn prove(&self, inputs: PartialWitness) -> Result> { - prove::( - &self.prover_only, - &self.common, - inputs, - &mut TimingTree::default(), - ) + let mut timing = TimingTree::new("CircuitData::prove", Level::Debug); + + let res = prove::(&self.prover_only, &self.common, inputs, &mut timing); + timing.print(); + res } pub fn verify(&self, proof_with_pis: ProofWithPublicInputs) -> Result<()> { diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index d05619311..b79391fc8 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -4,6 +4,7 @@ use alloc::{format, vec, vec::Vec}; use core::cmp::min; use core::mem::swap; +use std::time::Instant; use anyhow::{ensure, Result}; use hashbrown::HashMap; @@ -649,11 +650,15 @@ fn compute_quotient_polys< // steps away since we work on an LDE of degree `max_filtered_constraint_degree`. let next_step = 1 << quotient_degree_bits; + let timer = Instant::now(); let points = F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits); + println!("Time to compute LDE points: {:?}", timer.elapsed()); + let lde_size = points.len(); let z_h_on_coset = ZeroPolyOnCoset::new(common_data.degree_bits(), quotient_degree_bits); + let timer = Instant::now(); // Precompute the lookup table evals on the challenges in delta // These values are used to produce the final RE constraints for each lut, // and are the same each time in check_lookup_constraints_batched. @@ -686,14 +691,19 @@ fn compute_quotient_polys< } else { vec![] }; + println!( + "Time to compute LUT RE polynomial evals: {:?}", + timer.elapsed() + ); + let timer = Instant::now(); let lut_re_poly_evals_refs: Vec<&[F]> = lut_re_poly_evals.iter().map(|v| v.as_slice()).collect(); - let points_batches = points.par_chunks(BATCH_SIZE); let num_batches = points.len().div_ceil(BATCH_SIZE); - let quotient_values: Vec> = points_batches + let quotient_values: Vec> = points + .par_chunks(BATCH_SIZE) .enumerate() .flat_map(|(batch_i, xs_batch)| { // Each batch must be the same size, except the last one, which may be smaller. @@ -702,23 +712,26 @@ fn compute_quotient_polys< || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE) ); - let indices_batch: Vec = - (BATCH_SIZE * batch_i..BATCH_SIZE * batch_i + xs_batch.len()).collect(); + let batch_size = xs_batch.len(); + let batch_start = BATCH_SIZE * batch_i; + + let mut shifted_xs_batch = Vec::with_capacity(batch_size); + let mut local_zs_batch = Vec::with_capacity(batch_size); + let mut next_zs_batch = Vec::with_capacity(batch_size); - let mut shifted_xs_batch = Vec::with_capacity(xs_batch.len()); - let mut local_zs_batch = Vec::with_capacity(xs_batch.len()); - let mut next_zs_batch = Vec::with_capacity(xs_batch.len()); + let mut local_lookup_batch = Vec::with_capacity(batch_size); + let mut next_lookup_batch = Vec::with_capacity(batch_size); - let mut local_lookup_batch = Vec::with_capacity(xs_batch.len()); - let mut next_lookup_batch = Vec::with_capacity(xs_batch.len()); + let mut partial_products_batch = Vec::with_capacity(batch_size); + let mut s_sigmas_batch = Vec::with_capacity(batch_size); - let mut partial_products_batch = Vec::with_capacity(xs_batch.len()); - let mut s_sigmas_batch = Vec::with_capacity(xs_batch.len()); + let mut local_constants_batch_refs = Vec::with_capacity(batch_size); + let mut local_wires_batch_refs = Vec::with_capacity(batch_size); - let mut local_constants_batch_refs = Vec::with_capacity(xs_batch.len()); - let mut local_wires_batch_refs = Vec::with_capacity(xs_batch.len()); + // let timer1 = Instant::now(); - for (&i, &x) in indices_batch.iter().zip(xs_batch) { + for (j, &x) in xs_batch.iter().enumerate() { + let i = batch_start + j; let shifted_x = F::coset_shift() * x; let i_next = (i + next_step) % lde_size; let local_constants_sigmas = prover_data @@ -762,20 +775,28 @@ fn compute_quotient_polys< s_sigmas_batch.push(s_sigmas); } - // NB (JN): I'm not sure how (in)efficient the below is. It needs measuring. - let mut local_constants_batch = - vec![F::ZERO; xs_batch.len() * local_constants_batch_refs[0].len()]; - for i in 0..local_constants_batch_refs[0].len() { + // println!( + // "Time to gather LDE values for batch {}: {:?}", + // batch_i, + // timer1.elapsed() + // ); + + // Optimized transposition with better cache locality + let n_constants = local_constants_batch_refs[0].len(); + let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants]; + for i in 0..n_constants { + let offset = i * xs_batch.len(); for (j, constants) in local_constants_batch_refs.iter().enumerate() { - local_constants_batch[i * xs_batch.len() + j] = constants[i]; + local_constants_batch[offset + j] = constants[i]; } } - let mut local_wires_batch = - vec![F::ZERO; xs_batch.len() * local_wires_batch_refs[0].len()]; - for i in 0..local_wires_batch_refs[0].len() { + let n_wires = local_wires_batch_refs[0].len(); + let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires]; + for i in 0..n_wires { + let offset = i * xs_batch.len(); for (j, wires) in local_wires_batch_refs.iter().enumerate() { - local_wires_batch[i * xs_batch.len() + j] = wires[i]; + local_wires_batch[offset + j] = wires[i]; } } @@ -786,6 +807,8 @@ fn compute_quotient_polys< public_inputs_hash, ); + // let timer1 = Instant::now(); + let indices_batch: Vec = (batch_start..batch_start + batch_size).collect(); let mut quotient_values_batch = eval_vanishing_poly_base_batch::( common_data, &indices_batch, @@ -804,21 +827,42 @@ fn compute_quotient_polys< &z_h_on_coset, &lut_re_poly_evals_refs, ); - - for (&i, quotient_values) in indices_batch.iter().zip(quotient_values_batch.iter_mut()) - { + // println!( + // "Time to eval vanishing poly for batch {}: {:?}", + // batch_i, + // timer1.elapsed() + // ); + + // let timer1 = Instant::now(); + for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() { + let i = batch_start + j; let denominator_inv = z_h_on_coset.eval_inverse(i); quotient_values .iter_mut() .for_each(|v| *v *= denominator_inv); } + // println!( + // "Time to divide out Z_H for batch {}: {:?}", + // batch_i, + // timer1.elapsed() + // ); + quotient_values_batch }) .collect(); - transpose("ient_values) + println!( + "Time to compute quotient polys: {:?} for {} points", + timer.elapsed(), + quotient_values.len() + ); + + let timer = Instant::now(); + let res = transpose("ient_values) .into_par_iter() .map(PolynomialValues::new) .map(|values| values.coset_ifft(F::coset_shift())) - .collect() + .collect(); + println!("Time to compute quotient polys IFFT: {:?}", timer.elapsed()); + res } diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs index 48179ce63..0f6177daa 100644 --- a/plonky2/src/plonk/vanishing_poly.rs +++ b/plonky2/src/plonk/vanishing_poly.rs @@ -211,20 +211,29 @@ pub(crate) fn eval_vanishing_poly_base_batch, const let num_challenges = common_data.config.num_challenges; let num_routed_wires = common_data.config.num_routed_wires; - let mut numerator_values = Vec::with_capacity(num_routed_wires); - let mut denominator_values = Vec::with_capacity(num_routed_wires); + // Pre-allocate reusable buffers with exact capacities + let mut numerator_values = Vec::with_capacity(num_routed_wires * num_challenges); + let mut denominator_values = Vec::with_capacity(num_routed_wires * num_challenges); // The L_0(x) (Z(x) - 1) vanishing terms. let mut vanishing_z_1_terms = Vec::with_capacity(num_challenges); // The terms checking the partial products. - let mut vanishing_partial_products_terms = Vec::new(); + let mut vanishing_partial_products_terms = Vec::with_capacity(num_challenges * num_prods); // The terms checking the lookup constraints. - let mut vanishing_all_lookup_terms = if has_lookup { + let lookup_terms_capacity = if has_lookup { let num_sldc_polys = common_data.num_lookup_polys - 1; - Vec::with_capacity( - common_data.config.num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys), - ) + num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys) + } else { + 0 + }; + let mut vanishing_all_lookup_terms = Vec::with_capacity(lookup_terms_capacity); + + // Pre-allocate selector buffer if needed + let selector_offset = common_data.selectors_info.num_selectors(); + let num_lookup_selectors = common_data.num_lookup_selectors; + let mut lookup_selectors = if has_lookup && num_lookup_selectors > 0 { + Vec::with_capacity(num_lookup_selectors) } else { Vec::new() }; @@ -235,22 +244,20 @@ pub(crate) fn eval_vanishing_poly_base_batch, const let x = xs_batch[k]; let vars = vars_batch.view(k); - let lookup_selectors: Vec = (0..common_data.num_lookup_selectors) - .map(|i| vars.local_constants[common_data.selectors_info.num_selectors() + i]) - .collect(); + // Reuse lookup_selectors buffer + if has_lookup { + lookup_selectors.clear(); + lookup_selectors.extend( + (0..num_lookup_selectors).map(|i| vars.local_constants[selector_offset + i]), + ); + } let local_zs = local_zs_batch[k]; let next_zs = next_zs_batch[k]; - let local_lookup_zs = if has_lookup { - local_lookup_zs_batch[k] - } else { - &[] - }; - - let next_lookup_zs = if has_lookup { - next_lookup_zs_batch[k] + let (local_lookup_zs, next_lookup_zs) = if has_lookup { + (local_lookup_zs_batch[k], next_lookup_zs_batch[k]) } else { - &[] + (&[][..], &[][..]) }; let partial_products = partial_products_batch[k]; @@ -259,6 +266,16 @@ pub(crate) fn eval_vanishing_poly_base_batch, const let constraint_terms = PackedStridedView::new(&constraint_terms_batch, n, k); let l_0_x = z_h_on_coset.eval_l_0(index, x); + + // Pre-compute common values for all challenges + let beta_x_s_ids: Vec = if num_challenges > 0 { + (0..num_routed_wires) + .map(|j| common_data.k_is[j] * x) + .collect() + } else { + Vec::new() + }; + for i in 0..num_challenges { let z_x = local_zs[i]; let z_gx = next_zs[i]; @@ -268,10 +285,10 @@ pub(crate) fn eval_vanishing_poly_base_batch, const if has_lookup { let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)]; - let cur_local_lookup_zs = &local_lookup_zs - [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)]; - let cur_next_lookup_zs = &next_lookup_zs - [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)]; + let lookup_poly_start = common_data.num_lookup_polys * i; + let lookup_poly_end = lookup_poly_start + common_data.num_lookup_polys; + let cur_local_lookup_zs = &local_lookup_zs[lookup_poly_start..lookup_poly_end]; + let cur_next_lookup_zs = &next_lookup_zs[lookup_poly_start..lookup_poly_end]; let lookup_constraints = check_lookup_constraints_batch( common_data, @@ -285,17 +302,17 @@ pub(crate) fn eval_vanishing_poly_base_batch, const vanishing_all_lookup_terms.extend(lookup_constraints); } - numerator_values.extend((0..num_routed_wires).map(|j| { - let wire_value = vars.local_wires[j]; - let k_i = common_data.k_is[j]; - let s_id = k_i * x; - wire_value + betas[i] * s_id + gammas[i] - })); - denominator_values.extend((0..num_routed_wires).map(|j| { + let beta_i = betas[i]; + let gamma_i = gammas[i]; + + // Compute numerators and denominators in a single pass + for j in 0..num_routed_wires { let wire_value = vars.local_wires[j]; - let s_sigma = s_sigmas[j]; - wire_value + betas[i] * s_sigma + gammas[i] - })); + numerator_values + .push(wire_value + gamma_i.multiply_accumulate(beta_i, beta_x_s_ids[j])); + denominator_values + .push(wire_value + gamma_i.multiply_accumulate(beta_i, s_sigmas[j])); + } // The partial products considered for this iteration of `i`. let current_partial_products = &partial_products[i * num_prods..(i + 1) * num_prods]; @@ -587,7 +604,9 @@ pub fn check_lookup_constraints_batch, const D: usi // Check RE row transition constraint. let mut cur_sum = next_z_re; for elt in ¤t_lookup_combos { - cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt; + // cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt; + cur_sum = + elt.multiply_accumulate(cur_sum, deltas[LookupChallenges::ChallengeDelta as usize]); } let unfiltered_re_line = z_re - cur_sum; @@ -639,7 +658,10 @@ pub fn check_lookup_constraints_batch, const D: usi let lut_sum_prods_with_mul = (poly * lut_degree ..min((poly + 1) * lut_degree, num_lut_slots)) .fold(F::ZERO, |acc, i| { - acc + vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)] * lut_prod_i(i) + acc.multiply_accumulate( + vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)], + lut_prod_i(i), + ) }); // The previous element is the previous poly of the current row or the last poly of the next row. @@ -656,7 +678,8 @@ pub fn check_lookup_constraints_batch, const D: usi .push(lookup_selectors[LookupSelectors::TransSre as usize] * unfiltered_sum_transition); // Check LDC row and col transitions. It's the same constraint, with a row transition happening for slot == 0. - let unfiltered_ldc_transition = lu_prod * (z_x_lookup_sldcs[poly] - prev) + lu_sum_prods; + let unfiltered_ldc_transition = + lu_sum_prods.multiply_accumulate(lu_prod, (z_x_lookup_sldcs[poly] - prev)); constraints .push(lookup_selectors[LookupSelectors::TransLdc as usize] * unfiltered_ldc_transition); } From d9375789c72fc85726b3813df13c01cf2816cfab Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 15 Dec 2025 22:04:55 +0000 Subject: [PATCH 28/37] clean up --- plonky2/src/fri/oracle.rs | 306 +++++++++---------------- plonky2/src/plonk/prover.rs | 332 +++++++++++++--------------- plonky2/src/plonk/vanishing_poly.rs | 2 +- plonky2/src/util/mod.rs | 102 --------- 4 files changed, 265 insertions(+), 477 deletions(-) diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 29ebb5fd1..56bb34c53 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -53,24 +53,6 @@ impl, C: GenericConfig, const D: usize> D impl, C: GenericConfig, const D: usize> PolynomialBatch { - // pub fn from_values_gpu( - // values: Vec>, - // rate_bits: usize, - // blinding: bool, - // cap_height: usize, - // timing: &mut TimingTree, - // fft_root_table: Option<&FftRootTable>, - // ) -> Self { - // let coeffs = timed!( - // timing, - // "CPU IFFT", - // values - // .into_par_iter() - // .map(|v| v.ifft_cpu()) - // .collect::>() - // ); - // } - /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. /// This function is called by the builder during preprocessing the circuit. /// This function always calls IFFT on CPU to avoid strange GPU issue. @@ -91,14 +73,25 @@ impl, C: GenericConfig, const D: usize> .collect::>() ); - Self::from_coeffs_gpu( - coeffs, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - ) + if cfg!(feature = "cuda") { + Self::from_coeffs_gpu( + coeffs, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } else { + Self::from_coeffs_cpu( + coeffs, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } } /// Creates a list polynomial commitment for the polynomials `polynomials`. @@ -109,6 +102,36 @@ impl, C: GenericConfig, const D: usize> cap_height: usize, timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, + ) -> Self { + if cfg!(feature = "cuda") { + Self::from_coeffs_gpu( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } else { + Self::from_coeffs_cpu( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) + } + } + + /// Creates a list polynomial commitment for the polynomials `polynomials`. + fn from_coeffs_cpu( + polynomials: Vec>, + rate_bits: usize, + blinding: bool, + cap_height: usize, + timing: &mut TimingTree, + fft_root_table: Option<&FftRootTable>, ) -> Self { let degree = polynomials[0].len(); let lde_values = timed!( @@ -134,7 +157,7 @@ impl, C: GenericConfig, const D: usize> } } - pub fn from_coeffs_gpu( + fn from_coeffs_gpu( polynomials: Vec>, rate_bits: usize, blinding: bool, @@ -154,20 +177,17 @@ impl, C: GenericConfig, const D: usize> salt_size ); - #[cfg(feature = "cuda")] - { - if F::CUDA_SUPPORT { - return Self::from_coeffs_gpu_optimized( - polynomials, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - degree, - salt_size, - ); - } + if F::CUDA_SUPPORT { + return Self::from_coeffs_gpu_optimized( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + degree, + salt_size, + ); } // Fallback to CPU path @@ -209,129 +229,64 @@ impl, C: GenericConfig, const D: usize> blinding: bool, cap_height: usize, timing: &mut TimingTree, - fft_root_table: Option<&FftRootTable>, + _fft_root_table: Option<&FftRootTable>, degree: usize, salt_size: usize, ) -> Self { - println!("Using GPU-accelerated LDE computation"); - - use std::time::Instant; - use zeknox::device::memory::HostOrDeviceSlice; use zeknox::types::{NTTConfig, TransposeConfig}; use zeknox::{ntt_batch_ptr, transpose_rev_batch}; let lde_size = degree << rate_bits; let num_polys = polynomials.len() + salt_size; + let total_alloc_size = num_polys * lde_size; - // let lde_cpu = { - // // Fallback to CPU path - // let lde_values = polynomials - // .iter() - // .map(|p| { - // assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); - // p.lde(rate_bits) - // .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table) - // .values - // }) - // .collect::>(); - - // // for v in &lde_values { - // // println!("lde_values {:?}", v); - // // } - - // lde_values - // }; - - // let salt_size = if blinding { SALT_SIZE } else { 0 }; + let salt_polys = (0..salt_size) + .map(|_| F::rand_vec(lde_size)) + .collect::>(); // Step 1: Compute coset FFT on GPU, keeping data on GPU let gpu_lde_values = timed!(timing, "GPU coset FFT", { - // let mut all_lde_data = Vec::with_capacity(num_polys); - - // // Process each polynomial - // for p in polynomials.iter() { - // assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); - - // // Perform LDE (padding) and coset scaling on CPU - // let mut padded_coeffs = p.lde(rate_bits).coeffs; - - // // Apply coset shift - // let shift = F::coset_shift(); - // for (i, coeff) in padded_coeffs.iter_mut().enumerate() { - // *coeff *= shift.exp_u64(i as u64); - // } - - // all_lde_data.push(padded_coeffs); - // } - - // // Add salt - // for _ in 0..salt_size { - // all_lde_data.push(F::rand_vec(lde_size)); - // } - // Allocate GPU memory for all polynomials - let total_elements = num_polys * lde_size; println!( "Allocating GPU memory for {} polynomials of size {} (total {} elements)", - num_polys, lde_size, total_elements + num_polys, lde_size, total_alloc_size ); - let total_alloce_size = num_polys * lde_size; - // let total_alloce_size = num_polys.next_power_of_two() * lde_size; - let timer = Instant::now(); - let mut gpu_buffer = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size) - .expect("Failed to allocate GPU memory"); - println!("cuda alloc took: {:?}", timer.elapsed()); - println!( - "lde size: {}, total_elements: {}, total allocated size: {}", - lde_size, total_elements, total_alloce_size + let mut gpu_buffer = timed!( + timing, + format!("cuda alloc memory for {} elements", total_alloc_size).as_ref(), + HostOrDeviceSlice::cuda_malloc(0, total_alloc_size) + .expect("Failed to allocate GPU memory") ); - let timer = Instant::now(); // Copy all data to GPU in one go - let mut flat_data = vec![F::ZERO; total_alloce_size]; - - for i in 0..polynomials.len() { - - flat_data[i*lde_size.. i*lde_size +degree].copy_from_slice(polynomials[i].coeffs.as_ref()) - - } - // polynomials.par_iter().zip(flat_data.par_chunks_exact_mut(lde_size)).for_each(|(p, c)| - // c[..degree].copy_from_slice(p.coeffs.as_ref()) - // ); - - // let flat_data: Vec = polynomials - // .iter() - // .flat_map(|v| v.lde(rate_bits).coeffs) // pad each polynomial to lde_size - // // .into_iter() - // // .collect() - // // .iter() - // // .chain( - // // vec![F::ZERO; total_alloce_size - total_elements] - // // .iter() - // // .copied(), - // // ) - // .collect(); - - println!("cpu prepare took: {:?}", timer.elapsed()); - // let flat_data = vec![F::ZERO; lde_size - total_elements].iter() + let mut flat_data = vec![F::ZERO; total_alloc_size]; + + timed!(timing, "Prepare CPU memory", { + for i in 0..polynomials.len() { + flat_data[i * lde_size..i * lde_size + degree] + .copy_from_slice(polynomials[i].coeffs.as_ref()) + } + for i in polynomials.len()..num_polys { + flat_data[i * lde_size..(i + 1) * lde_size] + .copy_from_slice(salt_polys[i - polynomials.len()].as_slice()); + } + }); - println!( - "Copying {} elements to GPU (expected {})", - flat_data.len(), - total_elements + timed!( + timing, + "CPU to GPU", + gpu_buffer + .copy_from_host(&flat_data) + .expect("Failed to copy data to GPU") ); - let timer = Instant::now(); - gpu_buffer - .copy_from_host(&flat_data) - .expect("Failed to copy data to GPU"); - println!("IO took: {:?}", timer.elapsed()); - - // Perform batched NTT on GPU + // Technically we don't really need to do FFTs for the salt polynomial + // but then the cuda memory becomes extremely difficult to handle + // so we might as well do those FFTs. let log_domain_size = log2_strict(lde_size); let ntt_config = NTTConfig { batches: num_polys as u32, @@ -340,36 +295,21 @@ impl, C: GenericConfig, const D: usize> with_coset: true, ..Default::default() }; - - let timer = Instant::now(); - ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config); - - println!("comput took: {:?}", timer.elapsed()); - + timed!( + timing, + format!( + "GPU batch NTT for {} poly of degree {}", + num_polys, lde_size + ) + .as_ref(), + ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config) + ); gpu_buffer }); - println!("Completed GPU coset FFT for {} polynomials", num_polys); - - // let total_elements = num_polys * lde_size; - // let mut gpu_lde_values_copied_to_cpu = vec![F::ZERO; total_elements]; - - // gpu_lde_values - // .copy_to_host(&mut gpu_lde_values_copied_to_cpu, total_elements) - // .expect("Failed to copy data from GPU"); - - // println!("lde value: {:?}", gpu_lde_values_copied_to_cpu); - - // let lde_cpu_1d = lde_cpu.clone().into_iter().flatten().collect::>(); - // assert_eq!(lde_cpu_1d.len(), gpu_lde_values_copied_to_cpu.len(), "LDE size mismatch"); - // assert_eq!(lde_cpu_1d, gpu_lde_values_copied_to_cpu, "LDE values mismatch"); - // Step 2: Transpose on GPU using Zeknox let gpu_transposed = timed!(timing, "GPU transpose", { - let total_alloce_size = num_polys * lde_size; - // let total_alloce_size = num_polys.next_power_of_two() * lde_size; - - let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size) + let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloc_size) .expect("Failed to allocate GPU memory for transpose"); let log_n = log2_strict(lde_size); @@ -387,54 +327,24 @@ impl, C: GenericConfig, const D: usize> transpose_config, ); - // gpu_lde_values will be automatically freed when it goes out of scope - gpu_output }); - print!("Completed GPU transpose for {} polynomials", num_polys); // Step 3: Copy back to CPU - let leaves = timed!(timing, "GPU to CPU transfer", { - let total_elements = num_polys * lde_size; - let mut cpu_data = vec![F::ZERO; total_elements]; + let leaves_1d = timed!(timing, "GPU to CPU", { + let mut cpu_data = vec![F::ZERO; total_alloc_size]; gpu_transposed - .copy_to_host(&mut cpu_data, total_elements) + .copy_to_host(&mut cpu_data, total_alloc_size) .expect("Failed to copy data from GPU"); - // // Reshape into leaves: Vec> where each inner vec has num_polys elements - // cpu_data - // .chunks(num_polys) - // .map(|chunk| chunk.to_vec()) - // .collect::>() - cpu_data }); - // let mut leaves_cpu = timed!(timing, "transpose LDEs", transpose(&lde_cpu)); - // println!("tatal leaves: {}", leaves.len()); - // println!("leaves[0]: {:?}", leaves[0]); - // println!("leaves_cpu[0]: {:?}", leaves_cpu[0]); - - // reverse_index_bits_in_place(&mut leaves_cpu); - // for i in 0..leaves.len() { - // if leaves[i] != leaves_cpu[i] { - // println!("Mismatch at leaf {}: \n{:?}\n{:?}\n", i, leaves[i], leaves_cpu[i]); - // } - // } - - // assert!(leaves == leaves_cpu, "Transposed LDE values mismatch"); - - // let merkle_tree = timed!( - // timing, - // "build Merkle tree", - // MerkleTree::new_from_2d(leaves, cap_height) - // ); - - let merkle_tree = timed!( + let merkle_tree = timed!( timing, "build Merkle tree", - MerkleTree::new_from_1d(leaves,polynomials.len(), cap_height) + MerkleTree::new_from_1d(leaves_1d, polynomials.len(), cap_height) ); Self { diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index b79391fc8..24229a25e 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -4,7 +4,6 @@ use alloc::{format, vec, vec::Vec}; use core::cmp::min; use core::mem::swap; -use std::time::Instant; use anyhow::{ensure, Result}; use hashbrown::HashMap; @@ -276,6 +275,7 @@ where &gammas, &deltas, &alphas, + timing, ) ); @@ -631,6 +631,7 @@ fn compute_quotient_polys< gammas: &[F], deltas: &[F], alphas: &[F], + timing: &mut TimingTree, ) -> Vec> { let num_challenges = common_data.config.num_challenges; @@ -650,219 +651,198 @@ fn compute_quotient_polys< // steps away since we work on an LDE of degree `max_filtered_constraint_degree`. let next_step = 1 << quotient_degree_bits; - let timer = Instant::now(); - let points = F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits); - println!("Time to compute LDE points: {:?}", timer.elapsed()); + let points = timed!( + timing, + "set up subgroup generators", + F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits) + ); let lde_size = points.len(); let z_h_on_coset = ZeroPolyOnCoset::new(common_data.degree_bits(), quotient_degree_bits); - let timer = Instant::now(); // Precompute the lookup table evals on the challenges in delta // These values are used to produce the final RE constraints for each lut, // and are the same each time in check_lookup_constraints_batched. // lut_poly_evals[i][j] gives the eval for the i'th challenge and the j'th lookup table - let lut_re_poly_evals: Vec> = if has_lookup { - let num_lut_slots = LookupTableGate::num_slots(&common_data.config); - (0..num_challenges) - .map(move |i| { - let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)]; - let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize]; - - (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors) - .map(|r| { - let lut_row_number = common_data.luts - [r - LookupSelectors::StartEnd as usize] - .len() - .div_ceil(num_lut_slots); - - get_lut_poly( - common_data, - r - LookupSelectors::StartEnd as usize, - cur_deltas, - num_lut_slots * lut_row_number, - ) - .eval(cur_challenge_delta) - }) - .collect() - }) - .collect() - } else { - vec![] - }; - println!( - "Time to compute LUT RE polynomial evals: {:?}", - timer.elapsed() + let lut_re_poly_evals: Vec> = timed!( + timing, + "compute LUT RE polynomial evals", + if has_lookup { + let num_lut_slots = LookupTableGate::num_slots(&common_data.config); + (0..num_challenges) + .map(move |i| { + let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)]; + let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize]; + + (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors) + .map(|r| { + let lut_row_number = common_data.luts + [r - LookupSelectors::StartEnd as usize] + .len() + .div_ceil(num_lut_slots); + + get_lut_poly( + common_data, + r - LookupSelectors::StartEnd as usize, + cur_deltas, + num_lut_slots * lut_row_number, + ) + .eval(cur_challenge_delta) + }) + .collect() + }) + .collect() + } else { + vec![] + } ); - let timer = Instant::now(); let lut_re_poly_evals_refs: Vec<&[F]> = lut_re_poly_evals.iter().map(|v| v.as_slice()).collect(); let num_batches = points.len().div_ceil(BATCH_SIZE); - let quotient_values: Vec> = points - .par_chunks(BATCH_SIZE) - .enumerate() - .flat_map(|(batch_i, xs_batch)| { - // Each batch must be the same size, except the last one, which may be smaller. - debug_assert!( - xs_batch.len() == BATCH_SIZE - || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE) - ); - - let batch_size = xs_batch.len(); - let batch_start = BATCH_SIZE * batch_i; + let quotient_values: Vec> = timed!(timing, "compute quotient value", { + points + .par_chunks(BATCH_SIZE) + .enumerate() + .flat_map(|(batch_i, xs_batch)| { + // Each batch must be the same size, except the last one, which may be smaller. + debug_assert!( + xs_batch.len() == BATCH_SIZE + || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE) + ); - let mut shifted_xs_batch = Vec::with_capacity(batch_size); - let mut local_zs_batch = Vec::with_capacity(batch_size); - let mut next_zs_batch = Vec::with_capacity(batch_size); + let batch_size = xs_batch.len(); + let batch_start = BATCH_SIZE * batch_i; - let mut local_lookup_batch = Vec::with_capacity(batch_size); - let mut next_lookup_batch = Vec::with_capacity(batch_size); + let mut shifted_xs_batch = Vec::with_capacity(batch_size); + let mut local_zs_batch = Vec::with_capacity(batch_size); + let mut next_zs_batch = Vec::with_capacity(batch_size); - let mut partial_products_batch = Vec::with_capacity(batch_size); - let mut s_sigmas_batch = Vec::with_capacity(batch_size); + let mut local_lookup_batch = Vec::with_capacity(batch_size); + let mut next_lookup_batch = Vec::with_capacity(batch_size); - let mut local_constants_batch_refs = Vec::with_capacity(batch_size); - let mut local_wires_batch_refs = Vec::with_capacity(batch_size); + let mut partial_products_batch = Vec::with_capacity(batch_size); + let mut s_sigmas_batch = Vec::with_capacity(batch_size); - // let timer1 = Instant::now(); + let mut local_constants_batch_refs = Vec::with_capacity(batch_size); + let mut local_wires_batch_refs = Vec::with_capacity(batch_size); - for (j, &x) in xs_batch.iter().enumerate() { - let i = batch_start + j; - let shifted_x = F::coset_shift() * x; - let i_next = (i + next_step) % lde_size; - let local_constants_sigmas = prover_data - .constants_sigmas_commitment - .get_lde_values(i, step); - let local_constants = &local_constants_sigmas[common_data.constants_range()]; - let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()]; - let local_wires = wires_commitment.get_lde_values(i, step); - let local_zs_partial_and_lookup = - zs_partial_products_and_lookup_commitment.get_lde_values(i, step); - let next_zs_partial_and_lookup = - zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step); + for (j, &x) in xs_batch.iter().enumerate() { + let i = batch_start + j; + let shifted_x = F::coset_shift() * x; + let i_next = (i + next_step) % lde_size; + let local_constants_sigmas = prover_data + .constants_sigmas_commitment + .get_lde_values(i, step); + let local_constants = &local_constants_sigmas[common_data.constants_range()]; + let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()]; + let local_wires = wires_commitment.get_lde_values(i, step); + let local_zs_partial_and_lookup = + zs_partial_products_and_lookup_commitment.get_lde_values(i, step); + let next_zs_partial_and_lookup = + zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step); - let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()]; + let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()]; - let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()]; + let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()]; - let partial_products = - &local_zs_partial_and_lookup[common_data.partial_products_range()]; + let partial_products = + &local_zs_partial_and_lookup[common_data.partial_products_range()]; - if has_lookup { - let local_lookup_zs = &local_zs_partial_and_lookup[common_data.lookup_range()]; + if has_lookup { + let local_lookup_zs = + &local_zs_partial_and_lookup[common_data.lookup_range()]; - let next_lookup_zs = &next_zs_partial_and_lookup[common_data.lookup_range()]; - debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys()); + let next_lookup_zs = + &next_zs_partial_and_lookup[common_data.lookup_range()]; + debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys()); - local_lookup_batch.push(local_lookup_zs); - next_lookup_batch.push(next_lookup_zs); - } + local_lookup_batch.push(local_lookup_zs); + next_lookup_batch.push(next_lookup_zs); + } - debug_assert_eq!(local_wires.len(), common_data.config.num_wires); - debug_assert_eq!(local_zs.len(), num_challenges); + debug_assert_eq!(local_wires.len(), common_data.config.num_wires); + debug_assert_eq!(local_zs.len(), num_challenges); - local_constants_batch_refs.push(local_constants); - local_wires_batch_refs.push(local_wires); + local_constants_batch_refs.push(local_constants); + local_wires_batch_refs.push(local_wires); - shifted_xs_batch.push(shifted_x); - local_zs_batch.push(local_zs); - next_zs_batch.push(next_zs); - partial_products_batch.push(partial_products); - s_sigmas_batch.push(s_sigmas); - } + shifted_xs_batch.push(shifted_x); + local_zs_batch.push(local_zs); + next_zs_batch.push(next_zs); + partial_products_batch.push(partial_products); + s_sigmas_batch.push(s_sigmas); + } - // println!( - // "Time to gather LDE values for batch {}: {:?}", - // batch_i, - // timer1.elapsed() - // ); - - // Optimized transposition with better cache locality - let n_constants = local_constants_batch_refs[0].len(); - let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants]; - for i in 0..n_constants { - let offset = i * xs_batch.len(); - for (j, constants) in local_constants_batch_refs.iter().enumerate() { - local_constants_batch[offset + j] = constants[i]; + // Optimized transposition with better cache locality + let n_constants = local_constants_batch_refs[0].len(); + let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants]; + for i in 0..n_constants { + let offset = i * xs_batch.len(); + for (j, constants) in local_constants_batch_refs.iter().enumerate() { + local_constants_batch[offset + j] = constants[i]; + } } - } - let n_wires = local_wires_batch_refs[0].len(); - let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires]; - for i in 0..n_wires { - let offset = i * xs_batch.len(); - for (j, wires) in local_wires_batch_refs.iter().enumerate() { - local_wires_batch[offset + j] = wires[i]; + let n_wires = local_wires_batch_refs[0].len(); + let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires]; + for i in 0..n_wires { + let offset = i * xs_batch.len(); + for (j, wires) in local_wires_batch_refs.iter().enumerate() { + local_wires_batch[offset + j] = wires[i]; + } } - } - let vars_batch = EvaluationVarsBaseBatch::new( - xs_batch.len(), - &local_constants_batch, - &local_wires_batch, - public_inputs_hash, - ); + let vars_batch = EvaluationVarsBaseBatch::new( + xs_batch.len(), + &local_constants_batch, + &local_wires_batch, + public_inputs_hash, + ); - // let timer1 = Instant::now(); - let indices_batch: Vec = (batch_start..batch_start + batch_size).collect(); - let mut quotient_values_batch = eval_vanishing_poly_base_batch::( - common_data, - &indices_batch, - &shifted_xs_batch, - vars_batch, - &local_zs_batch, - &next_zs_batch, - &local_lookup_batch, - &next_lookup_batch, - &partial_products_batch, - &s_sigmas_batch, - betas, - gammas, - deltas, - alphas, - &z_h_on_coset, - &lut_re_poly_evals_refs, - ); - // println!( - // "Time to eval vanishing poly for batch {}: {:?}", - // batch_i, - // timer1.elapsed() - // ); - - // let timer1 = Instant::now(); - for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() { - let i = batch_start + j; - let denominator_inv = z_h_on_coset.eval_inverse(i); - quotient_values - .iter_mut() - .for_each(|v| *v *= denominator_inv); - } - // println!( - // "Time to divide out Z_H for batch {}: {:?}", - // batch_i, - // timer1.elapsed() - // ); + let indices_batch: Vec = (batch_start..batch_start + batch_size).collect(); + let mut quotient_values_batch = eval_vanishing_poly_base_batch::( + common_data, + &indices_batch, + &shifted_xs_batch, + vars_batch, + &local_zs_batch, + &next_zs_batch, + &local_lookup_batch, + &next_lookup_batch, + &partial_products_batch, + &s_sigmas_batch, + betas, + gammas, + deltas, + alphas, + &z_h_on_coset, + &lut_re_poly_evals_refs, + ); - quotient_values_batch - }) - .collect(); + for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() { + let i = batch_start + j; + let denominator_inv = z_h_on_coset.eval_inverse(i); + quotient_values + .iter_mut() + .for_each(|v| *v *= denominator_inv); + } - println!( - "Time to compute quotient polys: {:?} for {} points", - timer.elapsed(), - quotient_values.len() - ); + quotient_values_batch + }) + .collect() + }); - let timer = Instant::now(); - let res = transpose("ient_values) - .into_par_iter() - .map(PolynomialValues::new) - .map(|values| values.coset_ifft(F::coset_shift())) - .collect(); - println!("Time to compute quotient polys IFFT: {:?}", timer.elapsed()); - res + timed!( + timing, + "transpose and final ifft", + transpose("ient_values) + .into_par_iter() + .map(PolynomialValues::new) + .map(|values| values.coset_ifft(F::coset_shift())) + .collect() + ) } diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs index 0f6177daa..6a5b8a266 100644 --- a/plonky2/src/plonk/vanishing_poly.rs +++ b/plonky2/src/plonk/vanishing_poly.rs @@ -679,7 +679,7 @@ pub fn check_lookup_constraints_batch, const D: usi // Check LDC row and col transitions. It's the same constraint, with a row transition happening for slot == 0. let unfiltered_ldc_transition = - lu_sum_prods.multiply_accumulate(lu_prod, (z_x_lookup_sldcs[poly] - prev)); + lu_sum_prods.multiply_accumulate(lu_prod, z_x_lookup_sldcs[poly] - prev); constraints .push(lookup_selectors[LookupSelectors::TransLdc as usize] * unfiltered_ldc_transition); } diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index cb11f05e2..d0ec960c8 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -5,8 +5,6 @@ use alloc::vec::Vec; #[doc(inline)] pub use plonky2_util::*; -#[cfg(feature = "cuda")] -use zeknox::{device::memory::HostOrDeviceSlice, transpose_rev_batch, types::TransposeConfig}; use crate::field::polynomial::PolynomialValues; use crate::field::types::Field; @@ -23,81 +21,6 @@ pub(crate) fn transpose_poly_values(polys: Vec>) - transpose(&poly_values) } -#[cfg(feature = "cuda")] -fn transpose_gpu(matrix: &[Vec]) -> Vec> { - use std::time::Instant; - - if matrix.is_empty() || matrix[0].is_empty() { - return vec![]; - } - - let num_rows = matrix.len(); - let num_cols = matrix[0].len(); - let total_elements = num_rows * num_cols; - - // Flatten the 2D matrix into a 1D vector for GPU - let mut flat_input: Vec = Vec::with_capacity(total_elements); - for row in matrix { - flat_input.extend_from_slice(row); - } - - let gpu_id = 0; - let log_n = (num_cols as f64).log2().ceil() as usize; - - // Allocate GPU memory for input and output - let mut gpu_input: HostOrDeviceSlice<'_, T> = - HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements).unwrap(); - let mut gpu_output: HostOrDeviceSlice<'_, T> = - HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements).unwrap(); - - // Copy input to GPU - gpu_input.copy_from_host(&flat_input).unwrap(); - - // Configure transpose - let mut cfg = TransposeConfig::default(); - cfg.batches = num_rows as u32; - cfg.are_inputs_on_device = true; - cfg.are_outputs_on_device = true; - - let timers = Instant::now(); - // Perform GPU transpose - transpose_rev_batch( - gpu_id, - gpu_output.as_mut_ptr(), - gpu_input.as_mut_ptr(), - log_n, - cfg, - ); - println!( - "CUDA transpose of {}x{} took {:?}", - num_rows, - num_cols, - timers.elapsed() - ); - - let timer = Instant::now(); - // Copy result back to host - let mut flat_output = vec![unsafe { std::mem::zeroed() }; total_elements]; - gpu_output - .copy_to_host(&mut flat_output, total_elements) - .unwrap(); - println!( - "CUDA transpose copy back and reshape of {}x{} took {:?}", - num_rows, - num_cols, - timer.elapsed() - ); - - // Reshape back to 2D (transposed) using chunks_exact for better performance - // The GPU transpose outputs in column-major order, so we can just chunk by num_rows - let result: Vec> = flat_output - .chunks_exact(num_rows) - .map(|chunk| chunk.to_vec()) - .collect(); - - result -} - pub fn transpose(matrix: &[Vec]) -> Vec> { if matrix.is_empty() { return vec![]; @@ -105,31 +28,6 @@ pub fn transpose(matrix: &[Vec]) -> Vec> { let len = matrix[0].len(); - // #[cfg(feature = "cuda")] - // { - // // Use GPU for large matrices - // // Threshold: use GPU if total elements >= 2^16 (65536) or if CUDA_TRANSPOSE_THRESHOLD is set - // let num_rows = matrix.len(); - // let num_cols = len; - // let total_elements = num_rows * num_cols; - - // let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_TRANSPOSE_THRESHOLD") { - // if let Ok(threshold) = threshold_str.parse::() { - // total_elements >= threshold - // } else { - // total_elements >= 65536 - // } - // } else { - // total_elements >= 65536 - // }; - - // if use_gpu && num_cols.is_power_of_two() { - // return transpose_gpu(matrix); - // } - // } - - // CPU fallback - // Use sequential iteration for deterministic results (0..len) .map(|i| matrix.iter().map(|row| row[i]).collect()) .collect() From 248682545ef63d6d3b75e38cd37e97b151135abc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 15 Dec 2025 22:13:14 +0000 Subject: [PATCH 29/37] finished --- plonky2/examples/fibonacci.rs | 67 ++++++++++++----------------------- 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index d2d30e2d7..1cca1b513 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -1,9 +1,13 @@ use anyhow::{Ok, Result}; +use log::Level; use plonky2::field::types::Field; use plonky2::iop::witness::{PartialWitness, WitnessWrite}; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::circuit_data::CircuitConfig; use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; +use plonky2::util::timing::TimingTree; + +const LOOP: usize = 100_000; /// An example of using Plonky2 to prove a statement of the form /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b." @@ -19,31 +23,34 @@ fn main() -> Result<()> { type F = >::F; let config = CircuitConfig::standard_recursion_config(); - println!("Building circuit..."); let mut builder = CircuitBuilder::::new(config); - println!("Building arithmetic circuit..."); // The arithmetic circuit. let initial_a = builder.add_virtual_target(); let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..99 { + for _ in 0..LOOP { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; } - println!("Circuit built."); #[cfg(feature = "cuda")] { - let size = 3; + use plonky2_util::log2_ceil; + + let size = log2_ceil(builder.num_gates()); zeknox::clear_cuda_errors_rs(); - println!("Initializing CUDA twiddle factors..."); + println!( + "Initializing CUDA twiddle factors for dimeinsions 2^{} and 2^{}", + size, + size + 3 + ); zeknox::init_twiddle_factors_rs(0, size); zeknox::init_twiddle_factors_rs(0, size + 3); - // Initialize coset on GPU + // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) let coset_gen_u64 = 7u64; zeknox::init_coset_rs(0, size + 3, coset_gen_u64); @@ -53,52 +60,24 @@ fn main() -> Result<()> { builder.register_public_input(initial_a); builder.register_public_input(initial_b); builder.register_public_input(cur_target); - println!("Public inputs registered."); + // Provide initial values. let mut pw = PartialWitness::new(); pw.set_target(initial_a, F::ZERO)?; pw.set_target(initial_b, F::ONE)?; - println!("Initial values set in witness."); let data = builder.build::(); - println!("Circuit data built. Generating proof..."); - #[cfg(feature = "timing")] - { - use log::Level; - use plonky2::util::timing::TimingTree; - let mut timing = TimingTree::new("prove", Level::Info); - println!("Starting proof generation..."); - let proof = - plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?; - println!( - "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", - proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] - ); + let mut timing = TimingTree::new("prove", Level::Info); - // Print first few elements of wires_cap for comparison - println!("First wires_cap hash: {:?}", proof.proof.wires_cap.0[0]); - println!( - "First plonk_zs hash: {:?}", - proof.proof.plonk_zs_partial_products_cap.0[0] - ); - println!( - "First quotient hash: {:?}", - proof.proof.quotient_polys_cap.0[0] - ); + let proof = plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?; - timing.print(); - data.verify(proof)?; - } + println!( + "{}-th Fibonacci number mod |F| (starting with {}, {}) is: {}", + LOOP, proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] + ); - #[cfg(not(feature = "timing"))] - { - let proof = data.prove(pw)?; - println!( - "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", - proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] - ); - data.verify(proof)?; - } + timing.print(); + data.verify(proof)?; println!("finished"); Ok(()) From e0c397c8d6fccda0286365167ca2f22a1056f7e0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 15 Dec 2025 22:36:00 +0000 Subject: [PATCH 30/37] clean up --- plonky2/src/fri/oracle.rs | 108 ++++++++------------------------ plonky2/src/hash/merkle_tree.rs | 6 +- plonky2/src/plonk/prover.rs | 13 +--- plonky2/src/util/mod.rs | 2 + 4 files changed, 33 insertions(+), 96 deletions(-) diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 56bb34c53..f3daa648d 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -54,8 +54,6 @@ impl, C: GenericConfig, const D: usize> PolynomialBatch { /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`. - /// This function is called by the builder during preprocessing the circuit. - /// This function always calls IFFT on CPU to avoid strange GPU issue. pub fn from_values( values: Vec>, rate_bits: usize, @@ -64,6 +62,7 @@ impl, C: GenericConfig, const D: usize> timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, ) -> Self { + // The first IFFT is always done on CPU to avoid strange GPU issue. let coeffs = timed!( timing, "CPU IFFT", @@ -73,25 +72,14 @@ impl, C: GenericConfig, const D: usize> .collect::>() ); - if cfg!(feature = "cuda") { - Self::from_coeffs_gpu( - coeffs, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - ) - } else { - Self::from_coeffs_cpu( - coeffs, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - ) - } + Self::from_coeffs( + coeffs, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) } /// Creates a list polynomial commitment for the polynomials `polynomials`. @@ -157,79 +145,40 @@ impl, C: GenericConfig, const D: usize> } } + #[cfg(feature = "cuda")] fn from_coeffs_gpu( polynomials: Vec>, rate_bits: usize, blinding: bool, cap_height: usize, timing: &mut TimingTree, - fft_root_table: Option<&FftRootTable>, + _fft_root_table: Option<&FftRootTable>, ) -> Self { + assert!(F::CUDA_SUPPORT, "CUDA is not support for this field"); + let degree = polynomials[0].len(); // If blinding, salt with two random elements to each leaf vector. let salt_size = if blinding { SALT_SIZE } else { 0 }; - println!( - "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}", - polynomials.len(), - degree, - blinding, - salt_size - ); - if F::CUDA_SUPPORT { - return Self::from_coeffs_gpu_optimized( - polynomials, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - degree, - salt_size, - ); - } - - // Fallback to CPU path - let lde_values = polynomials - .iter() - .map(|p| { - assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); - p.lde(rate_bits) - .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table) - .values - }) - .chain( - (0..salt_size) - .into_iter() - .map(|_| F::rand_vec(degree << rate_bits)), - ) - .collect::>(); - let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values)); - reverse_index_bits_in_place(&mut leaves); - let merkle_tree = timed!( - timing, - "build Merkle tree", - MerkleTree::new_from_2d(leaves, cap_height) - ); - - Self { + Self::from_coeffs_gpu_helper( polynomials, - merkle_tree, - degree_log: log2_strict(degree), rate_bits, blinding, - } + cap_height, + timing, + degree, + salt_size, + ) } #[cfg(feature = "cuda")] - fn from_coeffs_gpu_optimized( + fn from_coeffs_gpu_helper( polynomials: Vec>, rate_bits: usize, blinding: bool, cap_height: usize, timing: &mut TimingTree, - _fft_root_table: Option<&FftRootTable>, degree: usize, salt_size: usize, ) -> Self { @@ -248,9 +197,11 @@ impl, C: GenericConfig, const D: usize> // Step 1: Compute coset FFT on GPU, keeping data on GPU let gpu_lde_values = timed!(timing, "GPU coset FFT", { // Allocate GPU memory for all polynomials - println!( + log::debug!( "Allocating GPU memory for {} polynomials of size {} (total {} elements)", - num_polys, lde_size, total_alloc_size + num_polys, + lde_size, + total_alloc_size ); let mut gpu_buffer = timed!( @@ -366,16 +317,9 @@ impl, C: GenericConfig, const D: usize> // If blinding, salt with two random elements to each leaf vector. let salt_size = if blinding { SALT_SIZE } else { 0 }; - println!( - "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}", - polynomials.len(), - degree, - blinding, - salt_size - ); polynomials - .iter() + .par_iter() .map(|p| { assert_eq!(p.len(), degree, "Polynomial degrees inconsistent"); p.lde(rate_bits) @@ -384,7 +328,7 @@ impl, C: GenericConfig, const D: usize> }) .chain( (0..salt_size) - .into_iter() + .into_par_iter() .map(|_| F::rand_vec(degree << rate_bits)), ) .collect() diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index b2a57df52..12f44970b 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -276,7 +276,7 @@ fn fill_digests_buf_gpu>( let leaves_count = leaves.len() / leaf_size; let num_gpus: usize = std::env::var("NUM_OF_GPUS") - .expect("NUM_OF_GPUS should be set") + .unwrap_or("1".to_string()) .parse() .unwrap(); @@ -286,7 +286,7 @@ fn fill_digests_buf_gpu>( if *gpu_id_lock >= num_gpus as u64 { *gpu_id_lock = 0; } - println!("Using GPU id {} leave length {}", gpu_id, leaves.len()); + log::debug!("Using GPU id {} leave length {}", gpu_id, leaves.len()); let now = Instant::now(); let gpu_leaves_buf_result = HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()); @@ -351,7 +351,7 @@ fn fill_digests_buf_gpu_ptr>( unsafe { let num_gpus: usize = std::env::var("NUM_OF_GPUS") - .expect("NUM_OF_GPUS should be set") + .unwrap_or("1".to_string()) .parse() .unwrap(); if !FORCE_SINGLE_GPU diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index 24229a25e..b4112cf8f 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -165,17 +165,10 @@ where // Use sequential iteration for deterministic results witness .wire_values - .iter() + .par_iter() .map(|column| PolynomialValues::new(column.clone())) .collect() ); - // Debug: Print first few wire values to check determinism - if !wires_values.is_empty() && !wires_values[0].values.is_empty() { - println!( - "First wire poly first 5 values: {:?}", - &wires_values[0].values[..5.min(wires_values[0].values.len())] - ); - } let wires_commitment = timed!( timing, "compute wires commitment", @@ -320,7 +313,6 @@ where "Opening point is in the subgroup." ); - println!("Constructing the opening set, including lookups."); let openings = timed!( timing, "construct the opening set, including lookups", @@ -334,7 +326,6 @@ where common_data ) ); - println!("Computed openings."); challenger.observe_openings(&openings.to_fri_openings()); let instance = common_data.get_fri_instance(zeta); @@ -357,7 +348,7 @@ where timing, ) ); - println!("Computed opening proofs."); + let proof = Proof:: { wires_cap: wires_commitment.merkle_tree.cap, plonk_zs_partial_products_cap: partial_products_zs_and_lookup_commitment.merkle_tree.cap, diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index d0ec960c8..73ec06f7a 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -3,6 +3,7 @@ #[cfg(not(feature = "std"))] use alloc::vec::Vec; +use plonky2_maybe_rayon::{MaybeIntoParIter, ParallelIterator}; #[doc(inline)] pub use plonky2_util::*; @@ -29,6 +30,7 @@ pub fn transpose(matrix: &[Vec]) -> Vec> { let len = matrix[0].len(); (0..len) + .into_par_iter() .map(|i| matrix.iter().map(|row| row[i]).collect()) .collect() } From 0606854a7e572cc9d94e46e4207afafe8de27bbb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 16 Dec 2025 15:14:13 +0000 Subject: [PATCH 31/37] fix bug --- field/Cargo.toml | 4 ++-- field/src/fft.rs | 8 ++------ plonky2/Cargo.toml | 4 ++-- plonky2/src/fri/oracle.rs | 37 ++++++++++++++++++------------------- 4 files changed, 24 insertions(+), 29 deletions(-) diff --git a/field/Cargo.toml b/field/Cargo.toml index 39ee8ef07..ba7de5f04 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -35,8 +35,8 @@ workspace = true [features] -# default = [] -default = [ "cuda" ] +default = [] +# default = [ "cuda" ] # default = [ "cuda", "cuda_sanity_check" ] cuda = [] # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared diff --git a/field/src/fft.rs b/field/src/fft.rs index e08fd8021..6f1d5355d 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -34,12 +34,8 @@ pub fn batch_fft(input: &[PolynomialCoeffs]) -> Vec, C: GenericConfig, const D: usize> timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, ) -> Self { - if cfg!(feature = "cuda") { - Self::from_coeffs_gpu( - polynomials, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - ) - } else { - Self::from_coeffs_cpu( - polynomials, - rate_bits, - blinding, - cap_height, - timing, - fft_root_table, - ) - } + #[cfg(feature = "cuda")] + return Self::from_coeffs_gpu( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ); + #[cfg(not(feature = "cuda"))] + Self::from_coeffs_cpu( + polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + ) } /// Creates a list polynomial commitment for the polynomials `polynomials`. From ca5a5bb0a3d6e573264e5524439bfc9e799ef921 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 16 Dec 2025 21:58:08 +0000 Subject: [PATCH 32/37] optimized cpu memory alloce --- field/Cargo.toml | 4 ++-- plonky2/Cargo.toml | 6 +++--- plonky2/src/fri/oracle.rs | 11 ++++++++--- plonky2/src/util/mem.rs | 17 +++++++++++++++++ plonky2/src/util/mod.rs | 1 + 5 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 plonky2/src/util/mem.rs diff --git a/field/Cargo.toml b/field/Cargo.toml index ba7de5f04..39ee8ef07 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -35,8 +35,8 @@ workspace = true [features] -default = [] -# default = [ "cuda" ] +# default = [] +default = [ "cuda" ] # default = [ "cuda", "cuda_sanity_check" ] cuda = [] # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 82594fb67..10deefedf 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -14,9 +14,9 @@ categories.workspace = true [features] # default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"] -default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ] - -# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ] +# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ] +# +default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ] # default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] # default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"] gate_testing = [] diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index e301e6e8e..364ea8497 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -185,6 +185,8 @@ impl, C: GenericConfig, const D: usize> use zeknox::types::{NTTConfig, TransposeConfig}; use zeknox::{ntt_batch_ptr, transpose_rev_batch}; + use crate::util::mem::vec_zeroed; + let lde_size = degree << rate_bits; let num_polys = polynomials.len() + salt_size; let total_alloc_size = num_polys * lde_size; @@ -211,10 +213,13 @@ impl, C: GenericConfig, const D: usize> ); // Copy all data to GPU in one go + let mut flat_data = timed!( + timing, + "Prepare CPU memory", + unsafe { vec_zeroed::(total_alloc_size) } + ); - let mut flat_data = vec![F::ZERO; total_alloc_size]; - - timed!(timing, "Prepare CPU memory", { + timed!(timing, "Copy CPU memory", { for i in 0..polynomials.len() { flat_data[i * lde_size..i * lde_size + degree] .copy_from_slice(polynomials[i].coeffs.as_ref()) diff --git a/plonky2/src/util/mem.rs b/plonky2/src/util/mem.rs new file mode 100644 index 000000000..c1cc44ef9 --- /dev/null +++ b/plonky2/src/util/mem.rs @@ -0,0 +1,17 @@ +// alloc memory for Vec, where every element is 0. (a lot) faster than vec![F::ZERO; len] +pub unsafe fn vec_zeroed(len: usize) -> Vec { + let elem_size = std::mem::size_of::(); + debug_assert!(elem_size != 0, "ZST not supported by this helper"); + + // Layout for len elements + let layout = std::alloc::Layout::array::(len).expect("layout overflow"); + + // Allocate zeroed memory + let ptr = std::alloc::alloc_zeroed(layout) as *mut F; + if ptr.is_null() { + std::alloc::handle_alloc_error(layout); + } + + // Take ownership as a Vec + Vec::from_raw_parts(ptr, len, len) +} diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index 73ec06f7a..a0e81e06e 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -11,6 +11,7 @@ use crate::field::polynomial::PolynomialValues; use crate::field::types::Field; pub(crate) mod context_tree; +pub(crate) mod mem; pub(crate) mod partial_products; pub mod reducing; pub mod serialization; From 67853083bc860c3df368026abe3ce6beb5d94764 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 16 Dec 2025 22:32:06 +0000 Subject: [PATCH 33/37] futher opts --- field/src/lib.rs | 1 + plonky2/src/util/mem.rs => field/src/util.rs | 11 +++++++---- plonky2/src/fri/oracle.rs | 11 ++++------- plonky2/src/hash/merkle_tree.rs | 4 +++- plonky2/src/plonk/prover.rs | 1 + plonky2/src/util/mod.rs | 1 - 6 files changed, 16 insertions(+), 13 deletions(-) rename plonky2/src/util/mem.rs => field/src/util.rs (58%) diff --git a/field/src/lib.rs b/field/src/lib.rs index 9a2ea4f9c..be1c4b512 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -24,6 +24,7 @@ pub mod polynomial; pub mod secp256k1_base; pub mod secp256k1_scalar; pub mod types; +pub mod util; pub mod zero_poly_coset; #[cfg(test)] diff --git a/plonky2/src/util/mem.rs b/field/src/util.rs similarity index 58% rename from plonky2/src/util/mem.rs rename to field/src/util.rs index c1cc44ef9..374a38986 100644 --- a/plonky2/src/util/mem.rs +++ b/field/src/util.rs @@ -1,15 +1,18 @@ +use alloc::vec::Vec; +use core::mem; + // alloc memory for Vec, where every element is 0. (a lot) faster than vec![F::ZERO; len] pub unsafe fn vec_zeroed(len: usize) -> Vec { - let elem_size = std::mem::size_of::(); + let elem_size = mem::size_of::(); debug_assert!(elem_size != 0, "ZST not supported by this helper"); // Layout for len elements - let layout = std::alloc::Layout::array::(len).expect("layout overflow"); + let layout = alloc::alloc::Layout::array::(len).expect("layout overflow"); // Allocate zeroed memory - let ptr = std::alloc::alloc_zeroed(layout) as *mut F; + let ptr = alloc::alloc::alloc_zeroed(layout) as *mut F; if ptr.is_null() { - std::alloc::handle_alloc_error(layout); + alloc::alloc::handle_alloc_error(layout); } // Take ownership as a Vec diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 364ea8497..292f2a0cf 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -181,12 +181,11 @@ impl, C: GenericConfig, const D: usize> degree: usize, salt_size: usize, ) -> Self { + use plonky2_field::util::vec_zeroed; use zeknox::device::memory::HostOrDeviceSlice; use zeknox::types::{NTTConfig, TransposeConfig}; use zeknox::{ntt_batch_ptr, transpose_rev_batch}; - use crate::util::mem::vec_zeroed; - let lde_size = degree << rate_bits; let num_polys = polynomials.len() + salt_size; let total_alloc_size = num_polys * lde_size; @@ -213,11 +212,9 @@ impl, C: GenericConfig, const D: usize> ); // Copy all data to GPU in one go - let mut flat_data = timed!( - timing, - "Prepare CPU memory", - unsafe { vec_zeroed::(total_alloc_size) } - ); + let mut flat_data = timed!(timing, "Prepare CPU memory", unsafe { + vec_zeroed::(total_alloc_size) + }); timed!(timing, "Copy CPU memory", { for i in 0..polynomials.len() { diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs index 12f44970b..4773d71ab 100644 --- a/plonky2/src/hash/merkle_tree.rs +++ b/plonky2/src/hash/merkle_tree.rs @@ -576,6 +576,8 @@ impl> MerkleTree { leaf_len: usize, cap_height: usize, ) -> Self { + use plonky2_field::util::vec_zeroed; + let log2_leaves_len = log2_strict(leaves_len); assert!( cap_height <= log2_leaves_len, @@ -585,7 +587,7 @@ impl> MerkleTree { ); // copy data from GPU in async mode - let mut host_leaves: Vec = vec![F::ZERO; leaves_len * leaf_len]; + let mut host_leaves: Vec = unsafe { vec_zeroed(leaves_len * leaf_len) }; let stream_copy = CudaStream::create().unwrap(); let start = std::time::Instant::now(); diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs index b4112cf8f..18ce4fc64 100644 --- a/plonky2/src/plonk/prover.rs +++ b/plonky2/src/plonk/prover.rs @@ -7,6 +7,7 @@ use core::mem::swap; use anyhow::{ensure, Result}; use hashbrown::HashMap; +use plonky2_field::util::vec_zeroed; use plonky2_maybe_rayon::*; use super::circuit_builder::{LookupChallenges, LookupWire}; diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index a0e81e06e..73ec06f7a 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -11,7 +11,6 @@ use crate::field::polynomial::PolynomialValues; use crate::field::types::Field; pub(crate) mod context_tree; -pub(crate) mod mem; pub(crate) mod partial_products; pub mod reducing; pub mod serialization; From fa060c41dabbdf8bbc8c73609aa301fc0459829d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 10 Jan 2026 17:41:18 +0000 Subject: [PATCH 34/37] clean up --- plonky2/src/gadgets/arithmetic_extension.rs | 36 +++++++++++++++++++++ plonky2/src/plonk/circuit_builder.rs | 16 +++++++++ 2 files changed, 52 insertions(+) diff --git a/plonky2/src/gadgets/arithmetic_extension.rs b/plonky2/src/gadgets/arithmetic_extension.rs index 9d1088030..6c8a253cf 100644 --- a/plonky2/src/gadgets/arithmetic_extension.rs +++ b/plonky2/src/gadgets/arithmetic_extension.rs @@ -629,6 +629,18 @@ mod tests { #[test] fn test_mul_many() -> Result<()> { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for a range of sizes that might be used + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + // Initialize coset for Goldilocks field (coset generator = 7) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; @@ -665,6 +677,18 @@ mod tests { #[test] fn test_div_extension() -> Result<()> { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for a range of sizes that might be used + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + // Initialize coset for Goldilocks field (coset generator = 7) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + const D: usize = 2; type C = PoseidonGoldilocksConfig; type F = >::F; @@ -692,6 +716,18 @@ mod tests { #[test] fn test_mul_algebra() -> Result<()> { + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for a range of sizes that might be used + for i in 0..=20 { + zeknox::init_twiddle_factors_rs(0, i); + } + // Initialize coset for Goldilocks field (coset generator = 7) + let coset_gen_u64 = 7u64; + zeknox::init_coset_rs(0, 20, coset_gen_u64); + } + const D: usize = 2; type C = KeccakGoldilocksConfig; type F = >::F; diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs index e6a81f378..74fa4be62 100644 --- a/plonky2/src/plonk/circuit_builder.rs +++ b/plonky2/src/plonk/circuit_builder.rs @@ -1225,6 +1225,22 @@ impl, const D: usize> CircuitBuilder { let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor))); let fft_root_table = fft_root_table(max_fft_points); + // Initialize GPU twiddle factors for all sizes we'll use + #[cfg(feature = "cuda")] + { + if F::CUDA_SUPPORT { + zeknox::clear_cuda_errors_rs(); + // Initialize twiddle factors for degree and LDE sizes + // degree_bits: the base degree + // degree_bits + rate_bits: the LDE size used in from_coeffs + // We need to initialize from 0 to cover all potential sizes + let max_log_size = degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor)); + for i in 0..=max_log_size { + zeknox::init_twiddle_factors_rs(0, i); + } + } + } + // This part of the code on GPU is buggy. So we use CPU for computation. // It does not impact performance as this is only done once during setup. let constants_sigmas_commitment = if commit_to_sigma { From 2055bbc704b1f05d60de13e1e2dc59ec1c2e828d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 10 Jan 2026 18:03:23 +0000 Subject: [PATCH 35/37] fix example --- plonky2/examples/fibonacci.rs | 50 +++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index 63c94cc78..fdde49534 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -1,39 +1,44 @@ -use anyhow::{Ok, Result}; +use std::time::Instant; + +use anyhow::Result; use log::Level; use plonky2::field::types::Field; use plonky2::iop::witness::{PartialWitness, WitnessWrite}; use plonky2::plonk::circuit_builder::CircuitBuilder; use plonky2::plonk::circuit_data::CircuitConfig; -use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; +use plonky2::plonk::config::{GenericConfig, Poseidon2GoldilocksConfig, PoseidonGoldilocksConfig}; +use plonky2::plonk::prover::prove; use plonky2::util::timing::TimingTree; -const LOOP: usize = 100_000; - /// An example of using Plonky2 to prove a statement of the form /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b." /// When a == 0 and b == 1, this is proving knowledge of the 100th (standard) Fibonacci number. fn main() -> Result<()> { - // Initialize logger to see timing output env_logger::Builder::from_default_env() - .format_timestamp(None) .filter_level(log::LevelFilter::Debug) .init(); + work::()?; + work::() +} + +fn work>() -> Result<()> { const D: usize = 2; let config = CircuitConfig::standard_recursion_config(); - let mut builder = CircuitBuilder::::new(config); + let mut builder = CircuitBuilder::::new(config); + // The arithmetic circuit. let initial_a = builder.add_virtual_target(); let initial_b = builder.add_virtual_target(); let mut prev_target = initial_a; let mut cur_target = initial_b; - for _ in 0..LOOP { + for _ in 0..999999 { let temp = builder.add(prev_target, cur_target); prev_target = cur_target; cur_target = temp; } - #[cfg(feature = "cuda")] + #[cfg(feature = "cuda")] { use plonky2_util::log2_ceil; @@ -62,22 +67,27 @@ fn main() -> Result<()> { // Provide initial values. let timer1 = Instant::now(); let mut pw = PartialWitness::new(); - pw.set_target(initial_a, F::ZERO)?; - pw.set_target(initial_b, F::ONE)?; + pw.set_target(initial_a, C::F::ZERO)?; + pw.set_target(initial_b, C::F::ONE)?; + let data = builder.build::(); + let timer2 = Instant::now(); - let mut timing = TimingTree::new("prove", Level::Info); + // Create a TimingTree to track detailed timing information + let mut timing = TimingTree::new("prove", Level::Debug); + let proof = prove::(&data.prover_only, &data.common, pw, &mut timing)?; + let timer3 = Instant::now(); - let proof = plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?; + // Print the timing tree + timing.print(); println!( - "{}-th Fibonacci number mod |F| (starting with {}, {}) is: {}", - LOOP, proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] + "100th Fibonacci number mod |F| (starting with {}, {}) is: {}", + proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2] ); - timing.print(); - data.verify(proof)?; + println!("Build time: {:?}", timer2.duration_since(timer1)); + println!("Prove time: {:?}", timer3.duration_since(timer2)); - println!("finished"); - Ok(()) -} + data.verify(proof) +} \ No newline at end of file From bd17b4ddfb6db20fb579aabacc92e9a817562543 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 10 Jan 2026 18:36:24 +0000 Subject: [PATCH 36/37] update dependency --- Cargo.toml | 3 ++- plonky2/examples/fibonacci.rs | 4 ++-- plonky2/src/hash/poseidon2/hash.rs | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3bb243ecf..336f94466 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,8 @@ rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } -zeknox = { path = "../zeknox/wrappers/rust" } +# zeknox = { path = "../zeknox/wrappers/rust" } +zeknox = { git = "https://github.com/elliottech/zeknox", branch = "zz/cuda-integration" } [profile.release] opt-level = 3 diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs index fdde49534..2f8bb84ae 100644 --- a/plonky2/examples/fibonacci.rs +++ b/plonky2/examples/fibonacci.rs @@ -38,7 +38,7 @@ fn work>() -> Result<()> { cur_target = temp; } - #[cfg(feature = "cuda")] + #[cfg(feature = "cuda")] { use plonky2_util::log2_ceil; @@ -90,4 +90,4 @@ fn work>() -> Result<()> { println!("Prove time: {:?}", timer3.duration_since(timer2)); data.verify(proof) -} \ No newline at end of file +} diff --git a/plonky2/src/hash/poseidon2/hash.rs b/plonky2/src/hash/poseidon2/hash.rs index 9fc6b8e4f..e73db08bc 100644 --- a/plonky2/src/hash/poseidon2/hash.rs +++ b/plonky2/src/hash/poseidon2/hash.rs @@ -10,9 +10,9 @@ use crate::gates::poseidon2::Poseidon2Gate; use crate::hash::hash_types::{HashOut, RichField}; use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation}; use crate::iop::ext_target::ExtensionTarget; -use crate::iop::target::{BoolTarget, Target};use crate::plonk::config::HasherType; +use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; -use crate::plonk::config::{AlgebraicHasher, Hasher}; +use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType}; pub trait Poseidon2: PrimeField64 { #[inline] From 041e1b75d22aaca351ae9aa9c3ee3b01a4a3c311 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 10 Jan 2026 18:58:51 +0000 Subject: [PATCH 37/37] update --- Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 336f94466..3bb243ecf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,7 @@ rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } -# zeknox = { path = "../zeknox/wrappers/rust" } -zeknox = { git = "https://github.com/elliottech/zeknox", branch = "zz/cuda-integration" } +zeknox = { path = "../zeknox/wrappers/rust" } [profile.release] opt-level = 3