diff --git a/Cargo.toml b/Cargo.toml index 81dbbde49..eed6218b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,8 @@ rand = { version = "0.8.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } static_assertions = { version = "1.1.0", default-features = false } unroll = { version = "0.1.5", default-features = false } +zeknox_= { path = "../zeknox/wrappers/rust"} + [profile.release] opt-level = 3 diff --git a/field/Cargo.toml b/field/Cargo.toml index e13f49efd..49cb04494 100644 --- a/field/Cargo.toml +++ b/field/Cargo.toml @@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] } static_assertions = { workspace = true } unroll = { workspace = true } +# cuda accelerator wrapper +zeknox = { workspace = true } + # Local dependencies plonky2_util = { version = "1.0.0", path = "../util", default-features = false } @@ -29,3 +32,9 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"] [lints] workspace = true + + +[features] +# default = [] +default = [ "cuda" ] +cuda = [] \ No newline at end of file diff --git a/field/src/fft.rs b/field/src/fft.rs index d078ca6c3..eeb86b62d 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -32,16 +32,230 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } +#[cfg(feature = "cuda")] +fn fft_dispatch_gpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + if F::CUDA_SUPPORT { + return ntt_batch( + 0, + input.as_mut_ptr(), + input.len().trailing_zeros() as usize, + NTTConfig::default(), + ); + } else { + return fft_dispatch_cpu(input, zero_factor, root_table); + } +} + +/// Batch FFT computation for multiple polynomials on GPU +#[cfg(feature = "cuda")] +fn fft_batch_dispatch_gpu( + inputs: &mut [F], + poly_size: usize, + num_polys: usize, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + if F::CUDA_SUPPORT { + let mut cfg = NTTConfig::default(); + cfg.batches = num_polys as u32; + + return ntt_batch( + 0, + inputs.as_mut_ptr(), + poly_size.trailing_zeros() as usize, + cfg, + ); + } else { + // Fallback to CPU: process each polynomial separately + for i in 0..num_polys { + let start = i * poly_size; + let end = start + poly_size; + fft_dispatch_cpu(&mut inputs[start..end], zero_factor, root_table); + } + } +} + +#[cfg(feature = "cuda")] +pub(crate) fn coset_fft_gpu( + poly: PolynomialCoeffs, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> PolynomialValues { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + if !F::CUDA_SUPPORT { + // Fallback to CPU if CUDA not supported for this field + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + return fft_with_options(modified_poly, zero_factor, root_table); + } + + let PolynomialCoeffs { coeffs: mut buffer } = poly; + let lg_n = buffer.len().trailing_zeros() as usize; + + // // Initialize coset on GPU + // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR) + // // TODO: Make this generic for other fields if needed + // let coset_gen_u64 = 7u64; + // init_coset_rs(0, lg_n, coset_gen_u64); + + // Configure NTT for coset + let mut cfg = NTTConfig::default(); + cfg.with_coset = true; + cfg.ntt_type = zeknox::types::NTTType::Coset; + + // Perform coset NTT on GPU + ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg); + + PolynomialValues::new(buffer) +} + +/// Batch coset FFT computation for multiple polynomials on GPU +#[cfg(feature = "cuda")] +fn coset_fft_batch_gpu( + polys: Vec>, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> Vec> { + use zeknox::ntt_batch; + use zeknox::types::NTTConfig; + + if polys.is_empty() { + return Vec::new(); + } + + let num_polys = polys.len(); + let poly_size = polys[0].len(); + + // Verify all polynomials have the same size + assert!( + polys.iter().all(|p| p.len() == poly_size), + "All polynomials must have the same size for batch coset FFT" + ); + + if !F::CUDA_SUPPORT { + // Fallback to CPU if CUDA not supported for this field + return polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + fft_with_options(modified_poly, zero_factor, root_table) + }) + .collect(); + } + + // Flatten all polynomials into a single contiguous buffer + let mut buffer: Vec = Vec::with_capacity(num_polys * poly_size); + for poly in polys { + buffer.extend_from_slice(&poly.coeffs); + } + + let lg_n = poly_size.trailing_zeros() as usize; + + // Configure NTT for batch coset + let mut cfg = NTTConfig::default(); + cfg.batches = num_polys as u32; + cfg.with_coset = true; + cfg.ntt_type = zeknox::types::NTTType::Coset; + + // Perform batch coset NTT on GPU + ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg); + + // Split the buffer back into separate polynomials + buffer + .chunks(poly_size) + .map(|chunk| PolynomialValues::new(chunk.to_vec())) + .collect() +} + +/// Compute coset FFT for multiple polynomials in batch. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +pub fn coset_fft_batch(polys: Vec>) -> Vec> { + coset_fft_batch_with_options(polys, None, None) +} + +/// Compute coset FFT for multiple polynomials in batch with options. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +pub fn coset_fft_batch_with_options( + polys: Vec>, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> Vec> { + #[cfg(feature = "cuda")] + return coset_fft_batch_gpu(polys, zero_factor, root_table); + + #[cfg(not(feature = "cuda"))] + { + // CPU fallback: process each polynomial separately + polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + fft_with_options(modified_poly, zero_factor, root_table) + }) + .collect() + } +} + +fn fft_dispatch_cpu( + input: &mut [F], + zero_factor: Option, + root_table: Option<&FftRootTable>, +) { + if root_table.is_some() { + return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap()); + } else { + // let pre_computed = F::pre_compute_fft_root_table(input.len()); + // if pre_computed.is_some() { + // return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap()); + // } else { + // let computed = fft_root_table::(input.len()); + + // return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + // } + let computed = fft_root_table::(input.len()); + + return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref()); + }; +} + #[inline] fn fft_dispatch( input: &mut [F], zero_factor: Option, root_table: Option<&FftRootTable>, ) { - let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len())); - let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap(); + #[cfg(feature = "cuda")] + return fft_dispatch_gpu(input, zero_factor, root_table); - fft_classic(input, zero_factor.unwrap_or(0), used_root_table); + #[cfg(not(feature = "cuda"))] + return fft_dispatch_cpu(input, zero_factor, root_table); } #[inline] @@ -60,6 +274,66 @@ pub fn fft_with_options( PolynomialValues::new(buffer) } +/// Compute FFT for multiple polynomials in batch. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +#[inline] +pub fn fft_batch(polys: Vec>) -> Vec> { + fft_batch_with_options(polys, None, None) +} + +/// Compute FFT for multiple polynomials in batch with options. +/// All polynomials must have the same size (power of 2). +/// Returns a vector of PolynomialValues in the same order as input. +pub fn fft_batch_with_options( + polys: Vec>, + zero_factor: Option, + root_table: Option<&FftRootTable>, +) -> Vec> { + if polys.is_empty() { + return Vec::new(); + } + + let num_polys = polys.len(); + let poly_size = polys[0].len(); + + // Verify all polynomials have the same size + assert!( + polys.iter().all(|p| p.len() == poly_size), + "All polynomials must have the same size for batch FFT" + ); + assert!( + poly_size.is_power_of_two(), + "Polynomial size must be a power of 2" + ); + + // Flatten all polynomials into a single contiguous buffer + let mut buffer: Vec = Vec::with_capacity(num_polys * poly_size); + for poly in polys { + buffer.extend_from_slice(&poly.coeffs); + } + + // Dispatch to GPU or CPU batch processing + #[cfg(feature = "cuda")] + fft_batch_dispatch_gpu(&mut buffer, poly_size, num_polys, zero_factor, root_table); + + #[cfg(not(feature = "cuda"))] + { + // CPU fallback: process each polynomial separately + for i in 0..num_polys { + let start = i * poly_size; + let end = start + poly_size; + fft_dispatch_cpu(&mut buffer[start..end], zero_factor, root_table); + } + } + + // Split the buffer back into separate polynomials + buffer + .chunks(poly_size) + .map(|chunk| PolynomialValues::new(chunk.to_vec())) + .collect() +} + #[inline] pub fn ifft(poly: PolynomialValues) -> PolynomialCoeffs { ifft_with_options(poly, None, None) @@ -206,8 +480,10 @@ mod tests { use alloc::vec::Vec; use plonky2_util::{log2_ceil, log2_strict}; + #[cfg(feature = "cuda")] + use zeknox::init_twiddle_factors_rs; - use crate::fft::{fft, fft_with_options, ifft}; + use crate::fft::{coset_fft_batch, fft, fft_batch, fft_with_options, ifft}; use crate::goldilocks_field::GoldilocksField; use crate::polynomial::{PolynomialCoeffs, PolynomialValues}; use crate::types::Field; @@ -218,6 +494,13 @@ mod tests { let degree = 200usize; let degree_padded = degree.next_power_of_two(); + #[cfg(feature = "cuda")] + let log_degree = { + zeknox::clear_cuda_errors_rs(); + let log_degree = degree_padded.trailing_zeros() as usize; + init_twiddle_factors_rs(0, log_degree); + log_degree + }; // Create a vector of coeffs; the first degree of them are // "random", the last degree_padded-degree of them are zero. let coeffs = (0..degree) @@ -239,6 +522,8 @@ mod tests { } for r in 0..4 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log_degree + r); // expand coefficients by factor 2^r by filling with zeros let zero_tail = coefficients.lde(r); assert_eq!( @@ -248,6 +533,248 @@ mod tests { } } + #[test] + #[cfg(feature = "cuda")] + fn test_fft_gpu_vs_cpu_single() { + type F = GoldilocksField; + + // Test various polynomial sizes + for log_size in [8, 10, 12, 14] { + let size = 1 << log_size; + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Create a random polynomial + let coeffs: Vec = (0..size) + .map(|i| F::from_canonical_usize(i * 7919 % 1000000)) + .collect(); + + let poly = PolynomialCoeffs { + coeffs: coeffs.clone(), + }; + + // Compute FFT using GPU (via fft function which dispatches to GPU) + let gpu_result = fft(poly.clone()); + + // Compute FFT using CPU (force CPU path) + let mut cpu_buffer = coeffs.clone(); + super::fft_dispatch_cpu(&mut cpu_buffer, None, None); + let cpu_result = PolynomialValues::new(cpu_buffer); + + // Compare results + assert_eq!( + gpu_result.len(), + cpu_result.len(), + "GPU and CPU results have different lengths for size {}", + size + ); + + for i in 0..size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Mismatch at index {} for polynomial size {}", + i, size + ); + } + } + } + + #[test] + #[cfg(feature = "cuda")] + fn test_fft_batch_gpu_vs_cpu() { + type F = GoldilocksField; + + let poly_size: usize = 1 << 10; // 1024 elements + let num_polys = 8; + let log_size = poly_size.trailing_zeros() as usize; + + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Create multiple random polynomials + let polys: Vec> = (0..num_polys) + .map(|batch_idx| { + let coeffs: Vec = (0..poly_size) + .map(|i| F::from_canonical_usize((i * 7919 + batch_idx * 12345) % 1000000)) + .collect(); + PolynomialCoeffs { coeffs } + }) + .collect(); + + // Compute batch FFT using GPU + let gpu_results = fft_batch(polys.clone()); + + // Compute FFT for each polynomial using CPU + let cpu_results: Vec> = polys + .into_iter() + .map(|poly| { + let mut buffer = poly.coeffs.clone(); + super::fft_dispatch_cpu(&mut buffer, None, None); + PolynomialValues::new(buffer) + }) + .collect(); + + // Compare results + assert_eq!(gpu_results.len(), cpu_results.len()); + for (batch_idx, (gpu_result, cpu_result)) in + gpu_results.iter().zip(cpu_results.iter()).enumerate() + { + assert_eq!(gpu_result.len(), cpu_result.len()); + for i in 0..poly_size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Batch FFT mismatch at batch {} index {}", + batch_idx, i + ); + } + } + } + + #[test] + #[cfg(feature = "cuda")] + fn test_coset_fft_gpu_vs_cpu_single() { + use zeknox::init_coset_rs; + + use crate::types::PrimeField64; + type F = GoldilocksField; + + for log_size in [8, 10, 12] { + let size = 1 << log_size; + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Initialize coset for GPU + let coset_gen_u64 = F::coset_shift().to_canonical_u64(); + init_coset_rs(0, log_size, coset_gen_u64); + + // Create a random polynomial + let coeffs: Vec = (0..size) + .map(|i| F::from_canonical_usize(i * 8191 % 1000000)) + .collect(); + + let poly = PolynomialCoeffs { + coeffs: coeffs.clone(), + }; + + // Compute coset FFT using GPU + let gpu_result = super::coset_fft_gpu(poly.clone(), None, None); + + // Compute coset FFT using CPU (apply coset shift then FFT) + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + + let mut cpu_buffer = modified_poly.coeffs; + super::fft_dispatch_cpu(&mut cpu_buffer, None, None); + let cpu_result = PolynomialValues::new(cpu_buffer); + + // Compare results + assert_eq!( + gpu_result.len(), + cpu_result.len(), + "GPU and CPU coset FFT results have different lengths for size {}", + size + ); + + for i in 0..size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Coset FFT mismatch at index {} for polynomial size {}", + i, size + ); + } + } + } + + #[test] + #[cfg(feature = "cuda")] + fn test_coset_fft_batch_gpu_vs_cpu() { + use zeknox::init_coset_rs; + + use crate::types::PrimeField64; + type F = GoldilocksField; + + let poly_size: usize = 1 << 10; // 1024 elements + let num_polys = 8; + let log_size = poly_size.trailing_zeros() as usize; + + zeknox::clear_cuda_errors_rs(); + init_twiddle_factors_rs(0, log_size); + + // Initialize coset for GPU + let coset_gen_u64 = F::coset_shift().to_canonical_u64(); + init_coset_rs(0, log_size, coset_gen_u64); + + // Create multiple random polynomials + let polys: Vec> = (0..num_polys) + .map(|batch_idx| { + let coeffs: Vec = (0..poly_size) + .map(|i| F::from_canonical_usize((i * 8191 + batch_idx * 54321) % 1000000)) + .collect(); + PolynomialCoeffs { coeffs } + }) + .collect(); + + // Compute batch coset FFT using GPU + let gpu_results = coset_fft_batch(polys.clone()); + + // Compute coset FFT for each polynomial using CPU + let cpu_results: Vec> = polys + .into_iter() + .map(|poly| { + let modified_poly: PolynomialCoeffs = F::coset_shift() + .powers() + .zip(&poly.coeffs) + .map(|(r, &c)| r * c) + .collect::>() + .into(); + + let mut buffer = modified_poly.coeffs; + super::fft_dispatch_cpu(&mut buffer, None, None); + PolynomialValues::new(buffer) + }) + .collect(); + + // Compare results + assert_eq!(gpu_results.len(), cpu_results.len()); + for (batch_idx, (gpu_result, cpu_result)) in + gpu_results.iter().zip(cpu_results.iter()).enumerate() + { + assert_eq!(gpu_result.len(), cpu_result.len()); + for i in 0..poly_size { + assert_eq!( + gpu_result.values[i], cpu_result.values[i], + "Batch coset FFT mismatch at batch {} index {}", + batch_idx, i + ); + } + } + } + + #[test] + fn test_batch_fft_empty() { + type F = GoldilocksField; + let polys: Vec> = vec![]; + let results = fft_batch(polys); + assert!(results.is_empty()); + } + + #[test] + #[should_panic(expected = "All polynomials must have the same size")] + fn test_batch_fft_different_sizes() { + type F = GoldilocksField; + let poly1 = PolynomialCoeffs { + coeffs: vec![F::ONE; 256], + }; + let poly2 = PolynomialCoeffs { + coeffs: vec![F::ONE; 512], + }; + let _ = fft_batch(vec![poly1, poly2]); + } + fn evaluate_naive(coefficients: &PolynomialCoeffs) -> PolynomialValues { let degree = coefficients.len(); let degree_padded = 1 << log2_ceil(degree); diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs index 8f2d85253..6dd15ce0d 100644 --- a/field/src/goldilocks_extensions.rs +++ b/field/src/goldilocks_extensions.rs @@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/2) const DTH_ROOT: Self = Self(18446744069414584320); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = + [Self(18081566051660590251), Self(16121475356294670766)]; - const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)]; + const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)]; } impl Mul for QuadraticExtension { @@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField { // DTH_ROOT = W^((ORDER - 1)/4) const DTH_ROOT: Self = Self(281474976710656); - const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = - [Self(0), Self(8295451483910296135), Self(0), Self(0)]; + const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [ + Self(5024755240244648895), + Self(13227474371289740625), + Self(3912887029498544536), + Self(3900057112666848848), + ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] = - [Self(0), Self(0), Self(0), Self(17216955519093520442)]; + [Self(0), Self(0), Self(0), Self(12587610116473453104)]; } impl Mul for QuarticExtension { @@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField { const DTH_ROOT: Self = Self(1041288259238279555); const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [ - Self(4624713872807171977), - Self(381988216716071028), - Self(14499722700050429911), - Self(4870631734967222356), - Self(4518902370426242880), + Self(2899034827742553394), + Self(13012057356839176729), + Self(14593811582388663055), + Self(7722900811313895436), + Self(4557222484695340057), ]; const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [ diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs index b0191ca59..ae8457744 100644 --- a/field/src/goldilocks_field.rs +++ b/field/src/goldilocks_field.rs @@ -77,17 +77,19 @@ impl Field for GoldilocksField { const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY; // Sage: `g = GF(p).multiplicative_generator()` - const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720); + const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7); // Sage: // ``` // g_2 = g^((p - 1) / 2^32) // g_2.multiplicative_order().factor() // ``` - const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926); + const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772); const BITS: usize = 64; + const CUDA_SUPPORT: bool = true; + fn order() -> BigUint { Self::ORDER.into() } diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs index df7084572..9772fff56 100644 --- a/field/src/interpolation.rs +++ b/field/src/interpolation.rs @@ -77,6 +77,9 @@ pub fn interpolate2(points: [(F, F); 2], x: F) -> F { #[cfg(test)] mod tests { + #[cfg(feature = "cuda")] + use zeknox::init_twiddle_factors_rs; + use super::*; use crate::extension::quartic::QuarticExtension; use crate::goldilocks_field::GoldilocksField; @@ -87,7 +90,12 @@ mod tests { fn interpolant_random() { type F = GoldilocksField; - for deg in 0..10 { + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + + for deg in 2..10 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log2_ceil(deg)); let domain = F::rand_vec(deg); let coeffs = F::rand_vec(deg); let coeffs = PolynomialCoeffs { coeffs }; @@ -101,7 +109,13 @@ mod tests { fn interpolant_random_roots_of_unity() { type F = GoldilocksField; - for deg_log in 0..4 { + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + + for deg_log in 1..4 { + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, deg_log); + let deg = 1 << deg_log; let domain = F::two_adic_subgroup(deg_log); let coeffs = F::rand_vec(deg); @@ -116,8 +130,15 @@ mod tests { fn interpolant_random_overspecified() { type F = GoldilocksField; + #[cfg(feature = "cuda")] + zeknox::clear_cuda_errors_rs(); + for deg in 0..10 { let points = deg + 5; + + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, log2_ceil(points)); + let domain = F::rand_vec(points); let coeffs = F::rand_vec(deg); let coeffs = PolynomialCoeffs { coeffs }; @@ -137,6 +158,8 @@ mod tests { let points = [(F::rand(), F::rand()), (F::rand(), F::rand())]; let x = F::rand(); + #[cfg(feature = "cuda")] + init_twiddle_factors_rs(0, 2); let ev0 = interpolant(&points).eval(x); let ev1 = interpolate(&points, x, &barycentric_weights(&points)); let ev2 = interpolate2(points, x); diff --git a/field/src/lib.rs b/field/src/lib.rs index c713db885..9a2ea4f9c 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -4,7 +4,6 @@ #![deny(rustdoc::broken_intra_doc_links)] #![deny(missing_debug_implementations)] #![feature(specialization)] -#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))] #![cfg_attr(not(test), no_std)] extern crate alloc; diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs index c13bbca27..be8bf0ad9 100644 --- a/field/src/polynomial/mod.rs +++ b/field/src/polynomial/mod.rs @@ -283,6 +283,15 @@ impl PolynomialCoeffs { zero_factor: Option, root_table: Option<&FftRootTable>, ) -> PolynomialValues { + #[cfg(feature = "cuda")] + { + if F::CUDA_SUPPORT && shift == F::coset_shift() { + // Use GPU coset FFT directly without CPU-side coefficient modification + return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table); + } + } + + // CPU path: multiply by powers of shift, then do regular FFT let modified_poly: Self = shift .powers() .zip(&self.coeffs) @@ -440,6 +449,8 @@ impl Mul for &PolynomialCoeffs { mod tests { use std::time::Instant; + #[cfg(feature = "cuda")] + use plonky2_util::log2_ceil; use rand::rngs::OsRng; use rand::Rng; @@ -479,6 +490,13 @@ mod tests { let k = 8; let n = 1 << k; + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + zeknox::init_twiddle_factors_rs(0, k); + } + let poly = PolynomialCoeffs::new(F::rand_vec(n)); let shift = F::rand(); let coset_evals = poly.coset_fft(shift).values; @@ -500,6 +518,13 @@ mod tests { let k = 8; let n = 1 << k; + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + zeknox::init_twiddle_factors_rs(0, k); + } + let evals = PolynomialValues::new(F::rand_vec(n)); let shift = F::rand(); let coeffs = evals.clone().coset_ifft(shift); @@ -520,6 +545,12 @@ mod tests { type F = GoldilocksField; let mut rng = OsRng; let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000)); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + zeknox::init_twiddle_factors_rs(0, log2_ceil(a_deg + b_deg + 1)); + } let a = PolynomialCoeffs::new(F::rand_vec(a_deg)); let b = PolynomialCoeffs::new(F::rand_vec(b_deg)); let m1 = &a * &b; @@ -537,11 +568,24 @@ mod tests { let mut rng = OsRng; let a_deg = rng.gen_range(0..1_000); let n = rng.gen_range(1..1_000); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + for i in 1..=log2_ceil(max(a_deg, n)) + 1 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let mut a = PolynomialCoeffs::new(F::rand_vec(a_deg + 1)); + println!("a {} b {}", a.len(), n); + if a.coeffs[0].is_zero() { a.coeffs[0] = F::ONE; // First coefficient needs to be nonzero. } let b = a.inv_mod_xn(n); + println!("a {} b {}", a.len(), b.len()); + let mut m = &a * &b; m.coeffs.truncate(n); m.trim(); @@ -575,6 +619,15 @@ mod tests { type F = GoldilocksField; let mut rng = OsRng; let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000)); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + for i in 1..=log2_ceil(max(a_deg, b_deg)) + 1 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let a = PolynomialCoeffs::new(F::rand_vec(a_deg)); let b = PolynomialCoeffs::new(F::rand_vec(b_deg)); let (q, r) = a.div_rem(&b); @@ -606,6 +659,7 @@ mod tests { let mut rng = OsRng; let l = 14; let n = 1 << l; + let g = F::primitive_root_of_unity(l); let xn_minus_one = { let mut xn_min_one_vec = vec![F::ZERO; n + 1]; @@ -616,6 +670,15 @@ mod tests { let a = g.exp_u64(rng.gen_range(0..(n as u64))); let denom = PolynomialCoeffs::new(vec![-a, F::ONE]); + + #[cfg(feature = "cuda")] + { + zeknox::clear_cuda_errors_rs(); + for i in 1..=l + 1 { + zeknox::init_twiddle_factors_rs(0, i); + } + } + let now = Instant::now(); xn_minus_one.div_rem(&denom); println!("Division time: {:?}", now.elapsed()); diff --git a/field/src/types.rs b/field/src/types.rs index d714b7a84..5a34bb6a3 100644 --- a/field/src/types.rs +++ b/field/src/types.rs @@ -91,6 +91,9 @@ pub trait Field: /// The bit length of the field order. const BITS: usize; + /// Whether this field is supported by cuda + const CUDA_SUPPORT: bool = false; + fn order() -> BigUint; fn characteristic() -> BigUint;