elliottech · lighter-zz · Nov 14, 2025 · Nov 15, 2025 · Nov 18, 2025 · Nov 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,6 @@ pgo-data.profdata
 
 # MacOS nuisances
 .DS_Store
+
+*.log
+
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,10 +9,12 @@ hashbrown = { version = "0.14.3", default-features = false, features = ["ahash",
 itertools = { version = "0.11.0", default-features = false }
 log = { version = "0.4.14", default-features = false }
 num = { version = "0.4", default-features = false, features = ["rand"] }
+once_cell = { version = "1.18.0", default-features = false }
 rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
+zeknox = { path = "../zeknox/wrappers/rust" }
 
 [profile.release]
 opt-level = 3

diff --git a/field/Cargo.toml b/field/Cargo.toml
@@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] }
 static_assertions = { workspace = true }
 unroll = { workspace = true }
 
+# CUDA accelerator wrapper
+zeknox = { workspace = true }
+
 # Local dependencies
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
@@ -29,3 +32,12 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"]
 
 [lints]
 workspace = true
+
+
+[features]
+# default = []
+default = [ "cuda" ]
+# default = [ "cuda", "cuda_sanity_check" ]
+cuda = []
+# sanity check: when this flag is on, the computation will be done on both CPU and CUDA, and the results are compared
+cuda_sanity_check = ["cuda"]
diff --git a/field/src/fft.rs b/field/src/fft.rs
@@ -11,6 +11,36 @@ use crate::types::Field;
 
 pub type FftRootTable<F> = Vec<Vec<F>>;
 
+pub fn batch_fft<F: Field>(input: &[PolynomialCoeffs<F>]) -> Vec<PolynomialValues<F>> {
+    #[cfg(feature = "cuda")]
+    {
+        use zeknox::ntt_batch;
+        use zeknox::types::NTTConfig;
+
+        let mut data = input
+            .iter()
+            .flat_map(|poly| poly.coeffs.clone())
+            .collect::<Vec<F>>();
+        let mut cfg = NTTConfig::default();
+        cfg.batches = input.len() as u32;
+        let poly_len = input[0].len();
+        ntt_batch(0, &mut data, log2_strict(poly_len), cfg);
+
+        data.chunks(poly_len)
+            .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+            .collect()
+    }
+    #[cfg(not(feature = "cuda"))]
+    {
+        let mut res = Vec::with_capacity(input.len());
+        for poly in input.iter() {
+            let pv = fft_with_options(poly.clone(), None, None);
+            res.push(pv);
+        }
+        res
+    }
+}
+
 pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     let lg_n = log2_strict(n);
     // bases[i] = g^2^i, for i = 0, ..., lg_n - 1
@@ -32,16 +62,72 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
+#[cfg(feature = "cuda")]
+fn fft_dispatch_gpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if F::CUDA_SUPPORT {
+        use zeknox::ntt_batch;
+        use zeknox::types::NTTConfig;
+
+        #[cfg(feature = "cuda_sanity_check")]
+        let cpu_res = {
+            let mut input_clone = input.to_vec();
+            fft_dispatch_cpu(&mut input_clone, zero_factor, root_table);
+            input_clone
+        };
+
+        ntt_batch(
+            0,
+            input,
+            input.len().trailing_zeros() as usize,
+            NTTConfig::default(),
+        );
+
+        #[cfg(feature = "cuda_sanity_check")]
+        for (i, (a, b)) in input.iter().zip(cpu_res.iter()).enumerate() {
+            if a != b {
+                panic!(
+                    "Mismatch at index {}: gpu result = {}, cpu result = {}",
+                    i, a, b
+                );
+            }
+        }
+        return;
+    } else {
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
+}
+
+fn fft_dispatch_cpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if root_table.is_some() {
+        fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap())
+    } else {
+        let computed = fft_root_table::<F>(input.len());
+        fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref())
+    };
+}
+
 #[inline]
 fn fft_dispatch<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
-    let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();
-
-    fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
+    #[cfg(feature = "cuda")]
+    {
+        fft_dispatch_gpu(input, zero_factor, root_table)
+    }
+    #[cfg(not(feature = "cuda"))]
+    {
+        fft_dispatch_cpu(input, zero_factor, root_table)
+    }
 }
 
 #[inline]
@@ -50,6 +136,7 @@ pub fn fft<F: Field>(poly: PolynomialCoeffs<F>) -> PolynomialValues<F> {
 }
 
 #[inline]
+
 pub fn fft_with_options<F: Field>(
     poly: PolynomialCoeffs<F>,
     zero_factor: Option<usize>,
@@ -65,6 +152,28 @@ pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
     ifft_with_options(poly, None, None)
 }
 
+#[inline]
+pub fn ifft_cpu<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
+    let n = poly.len();
+    let lg_n = log2_strict(n);
+    let n_inv = F::inverse_2exp(lg_n);
+
+    let PolynomialValues { values: mut buffer } = poly;
+    fft_dispatch_cpu(&mut buffer, None, None);
+
+    // We reverse all values except the first, and divide each by n.
+    buffer[0] *= n_inv;
+    buffer[n / 2] *= n_inv;
+    for i in 1..(n / 2) {
+        let j = n - i;
+        let coeffs_i = buffer[j] * n_inv;
+        let coeffs_j = buffer[i] * n_inv;
+        buffer[i] = coeffs_i;
+        buffer[j] = coeffs_j;
+    }
+    PolynomialCoeffs { coeffs: buffer }
+}
+
 pub fn ifft_with_options<F: Field>(
     poly: PolynomialValues<F>,
     zero_factor: Option<usize>,
@@ -214,6 +323,17 @@ mod tests {
 
     #[test]
     fn fft_and_ifft() {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for sizes we'll use
+            // degree_padded is 256 = 2^8
+            // the LDE then adds up to 4 more bits (2^9 through 2^12)
+            for i in 8..=12 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         type F = GoldilocksField;
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
@@ -222,7 +342,7 @@ mod tests {
         // "random", the last degree_padded-degree of them are zero.
         let coeffs = (0..degree)
             .map(|i| F::from_canonical_usize(i * 1337 % 100))
-            .chain(core::iter::repeat_n(F::ZERO, degree_padded - degree))
+            .chain(core::iter::repeat(F::ZERO).take(degree_padded - degree))
             .collect::<Vec<_>>();
         assert_eq!(coeffs.len(), degree_padded);
         let coefficients = PolynomialCoeffs { coeffs };

diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
@@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/2)
     const DTH_ROOT: Self = Self(18446744069414584320);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
+        [Self(18081566051660590251), Self(16121475356294670766)];
 
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)];
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
 }
 
 impl Mul for QuadraticExtension<GoldilocksField> {
@@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/4)
     const DTH_ROOT: Self = Self(281474976710656);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] =
-        [Self(0), Self(8295451483910296135), Self(0), Self(0)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
+        Self(5024755240244648895),
+        Self(13227474371289740625),
+        Self(3912887029498544536),
+        Self(3900057112666848848),
+    ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
-        [Self(0), Self(0), Self(0), Self(17216955519093520442)];
+        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
 }
 
 impl Mul for QuarticExtension<GoldilocksField> {
@@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField {
     const DTH_ROOT: Self = Self(1041288259238279555);
 
     const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
-        Self(4624713872807171977),
-        Self(381988216716071028),
-        Self(14499722700050429911),
-        Self(4870631734967222356),
-        Self(4518902370426242880),
+        Self(2899034827742553394),
+        Self(13012057356839176729),
+        Self(14593811582388663055),
+        Self(7722900811313895436),
+        Self(4557222484695340057),
     ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [

diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
@@ -77,17 +77,19 @@ impl Field for GoldilocksField {
     const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY;
 
     // Sage: `g = GF(p).multiplicative_generator()`
-    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720);
+    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7);
 
     // Sage:
     // ```
     // g_2 = g^((p - 1) / 2^32)
     // g_2.multiplicative_order().factor()
     // ```
-    const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926);
+    const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772);
 
     const BITS: usize = 64;
 
+    const CUDA_SUPPORT: bool = true;
+
     fn order() -> BigUint {
         Self::ORDER.into()
     }

diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs
@@ -77,6 +77,9 @@ pub fn interpolate2<F: Field>(points: [(F, F); 2], x: F) -> F {
 
 #[cfg(test)]
 mod tests {
+    #[cfg(feature = "cuda")]
+    use zeknox::init_twiddle_factors_rs;
+
     use super::*;
     use crate::extension::quartic::QuarticExtension;
     use crate::goldilocks_field::GoldilocksField;
@@ -87,7 +90,12 @@ mod tests {
     fn interpolant_random() {
         type F = GoldilocksField;
 
-        for deg in 0..10 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg in 2..10 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(deg));
             let domain = F::rand_vec(deg);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -101,7 +109,13 @@ mod tests {
     fn interpolant_random_roots_of_unity() {
         type F = GoldilocksField;
 
-        for deg_log in 0..4 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg_log in 1..4 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, deg_log);
+
             let deg = 1 << deg_log;
             let domain = F::two_adic_subgroup(deg_log);
             let coeffs = F::rand_vec(deg);
@@ -116,8 +130,15 @@ mod tests {
     fn interpolant_random_overspecified() {
         type F = GoldilocksField;
 
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
         for deg in 0..10 {
             let points = deg + 5;
+
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(points));
+
             let domain = F::rand_vec(points);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -137,6 +158,8 @@ mod tests {
         let points = [(F::rand(), F::rand()), (F::rand(), F::rand())];
         let x = F::rand();
 
+        #[cfg(feature = "cuda")]
+        init_twiddle_factors_rs(0, 2);
         let ev0 = interpolant(&points).eval(x);
         let ev1 = interpolate(&points, x, &barycentric_weights(&points));
         let ev2 = interpolate2(points, x);

diff --git a/field/src/lib.rs b/field/src/lib.rs
@@ -24,6 +24,7 @@ pub mod polynomial;
 pub mod secp256k1_base;
 pub mod secp256k1_scalar;
 pub mod types;
+pub mod util;
 pub mod zero_poly_coset;
 
 #[cfg(test)]

diff --git a/field/src/polynomial/division.rs b/field/src/polynomial/division.rs
@@ -78,7 +78,7 @@ impl<F: Field> PolynomialCoeffs<F> {
             .iter()
             .rev()
             .scan(F::ZERO, |acc, &c| {
-                *acc = *acc * z + c;
+                *acc = c.multiply_accumulate(*acc, z);
                 Some(*acc)
             })
             .collect::<Vec<_>>();
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,3 +8,6 @@ pgo-data.profdata

		# MacOS nuisances
		.DS_Store

		*.log