From 0c6091e5adcf153fd916795a3bc060a0f258f6e6 Mon Sep 17 00:00:00 2001
From: zhenfeizhang <zhenfei.zhang@hotmail.com>
Date: Fri, 14 Nov 2025 10:18:20 -0500
Subject: [PATCH 01/37] fix parameters

---
 Cargo.toml                         | 2 ++
 field/src/goldilocks_extensions.rs | 5 +++--
 field/src/goldilocks_field.rs      | 4 ++--
 field/src/lib.rs                   | 1 -
 4 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 81dbbde49..eed6218b8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,8 @@ rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
+zeknox = { path = "../zeknox/wrappers/rust" }
+
 
 [profile.release]
 opt-level = 3
diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
index 8f2d85253..8b7d5607e 100644
--- a/field/src/goldilocks_extensions.rs
+++ b/field/src/goldilocks_extensions.rs
@@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/2)
     const DTH_ROOT: Self = Self(18446744069414584320);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
+        [Self(18081566051660590251), Self(16121475356294670766)];
 
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)];
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
 }
 
 impl Mul for QuadraticExtension<GoldilocksField> {
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index b0191ca59..4e459c908 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -77,14 +77,14 @@ impl Field for GoldilocksField {
     const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY;
 
     // Sage: `g = GF(p).multiplicative_generator()`
-    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720);
+    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7);
 
     // Sage:
     // ```
     // g_2 = g^((p - 1) / 2^32)
     // g_2.multiplicative_order().factor()
     // ```
-    const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926);
+    const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772);
 
     const BITS: usize = 64;
 
diff --git a/field/src/lib.rs b/field/src/lib.rs
index c713db885..9a2ea4f9c 100644
--- a/field/src/lib.rs
+++ b/field/src/lib.rs
@@ -4,7 +4,6 @@
 #![deny(rustdoc::broken_intra_doc_links)]
 #![deny(missing_debug_implementations)]
 #![feature(specialization)]
-#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))]
 #![cfg_attr(not(test), no_std)]
 
 extern crate alloc;

From 7065a8277aa1f426a7f3bd2380716c6f8e31dfc3 Mon Sep 17 00:00:00 2001
From: zhenfeizhang <zhenfei.zhang@hotmail.com>
Date: Fri, 14 Nov 2025 19:37:21 -0500
Subject: [PATCH 02/37] fix fft

---
 field/Cargo.toml                   |  9 +++++
 field/src/fft.rs                   | 60 ++++++++++++++++++++++++++++--
 field/src/goldilocks_extensions.rs | 20 ++++++----
 field/src/goldilocks_field.rs      |  2 +
 field/src/interpolation.rs         | 27 +++++++++++++-
 field/src/polynomial/mod.rs        | 54 +++++++++++++++++++++++++++
 field/src/types.rs                 |  3 ++
 7 files changed, 162 insertions(+), 13 deletions(-)

diff --git a/field/Cargo.toml b/field/Cargo.toml
index e13f49efd..49cb04494 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] }
 static_assertions = { workspace = true }
 unroll = { workspace = true }
 
+# cuda accelerator wrapper
+zeknox = { workspace = true }
+
 # Local dependencies
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
@@ -29,3 +32,9 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"]
 
 [lints]
 workspace = true
+
+
+[features]
+# default = []
+default = [ "cuda" ]
+cuda = []
\ No newline at end of file
diff --git a/field/src/fft.rs b/field/src/fft.rs
index d078ca6c3..85defc48b 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -32,16 +32,59 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
+/// Runs an in-place FFT on `input`, using the GPU when `F::CUDA_SUPPORT` is set.
+///
+/// `zero_factor` and `root_table` are only consulted by the CPU fallback.
+#[cfg(feature = "cuda")]
+fn fft_dispatch_gpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+    if F::CUDA_SUPPORT {
+        // NOTE(review): `trailing_zeros` equals log2 only for power-of-two lengths.
+        let lg_n = input.len().trailing_zeros() as usize;
+        ntt_batch(0, input.as_mut_ptr(), lg_n, NTTConfig::default());
+    } else {
+        fft_dispatch_cpu(input, zero_factor, root_table);
+    }
+}
+
+fn fft_dispatch_cpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    // CPU FFT entry point: runs `fft_classic` in place on `input`.
+    //
+    // `zero_factor` defaults to 0 when absent. A caller-supplied `root_table` is used as is;
+    // otherwise a table for `input.len()` is built on the fly for this call.
+    //
+    // TODO: consider a precomputed per-field root table (e.g. an
+    // `F::pre_compute_fft_root_table` hook) instead of rebuilding it on every call.
+    let r = zero_factor.unwrap_or(0);
+    match root_table {
+        Some(table) => fft_classic(input, r, table),
+        None => {
+            let computed = fft_root_table::<F>(input.len());
+            fft_classic(input, r, &computed);
+        }
+    }
+}
+
 #[inline]
 fn fft_dispatch<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
-    let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();
+    #[cfg(feature = "cuda")]
+    return fft_dispatch_gpu(input, zero_factor, root_table);
 
-    fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
+    #[cfg(not(feature = "cuda"))]
+    return fft_dispatch_cpu(input, zero_factor, root_table);
 }
 
 #[inline]
@@ -206,6 +249,8 @@ mod tests {
     use alloc::vec::Vec;
 
     use plonky2_util::{log2_ceil, log2_strict};
+    #[cfg(feature = "cuda")]
+    use zeknox::init_twiddle_factors_rs;
 
     use crate::fft::{fft, fft_with_options, ifft};
     use crate::goldilocks_field::GoldilocksField;
@@ -218,6 +263,13 @@ mod tests {
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
 
+        #[cfg(feature = "cuda")]
+        let log_degree = {
+            zeknox::clear_cuda_errors_rs();
+            let log_degree = degree_padded.trailing_zeros() as usize;
+            init_twiddle_factors_rs(0, log_degree);
+            log_degree
+        };
         // Create a vector of coeffs; the first degree of them are
         // "random", the last degree_padded-degree of them are zero.
         let coeffs = (0..degree)
@@ -239,6 +291,8 @@ mod tests {
         }
 
         for r in 0..4 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log_degree + r);
             // expand coefficients by factor 2^r by filling with zeros
             let zero_tail = coefficients.lde(r);
             assert_eq!(
diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
index 8b7d5607e..6dd15ce0d 100644
--- a/field/src/goldilocks_extensions.rs
+++ b/field/src/goldilocks_extensions.rs
@@ -45,11 +45,15 @@ impl Extendable<4> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/4)
     const DTH_ROOT: Self = Self(281474976710656);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] =
-        [Self(0), Self(8295451483910296135), Self(0), Self(0)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
+        Self(5024755240244648895),
+        Self(13227474371289740625),
+        Self(3912887029498544536),
+        Self(3900057112666848848),
+    ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
-        [Self(0), Self(0), Self(0), Self(17216955519093520442)];
+        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
 }
 
 impl Mul for QuarticExtension<GoldilocksField> {
@@ -71,11 +75,11 @@ impl Extendable<5> for GoldilocksField {
     const DTH_ROOT: Self = Self(1041288259238279555);
 
     const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
-        Self(4624713872807171977),
-        Self(381988216716071028),
-        Self(14499722700050429911),
-        Self(4870631734967222356),
-        Self(4518902370426242880),
+        Self(2899034827742553394),
+        Self(13012057356839176729),
+        Self(14593811582388663055),
+        Self(7722900811313895436),
+        Self(4557222484695340057),
     ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index 4e459c908..ae8457744 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -88,6 +88,8 @@ impl Field for GoldilocksField {
 
     const BITS: usize = 64;
 
+    const CUDA_SUPPORT: bool = true;
+
     fn order() -> BigUint {
         Self::ORDER.into()
     }
diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs
index df7084572..9772fff56 100644
--- a/field/src/interpolation.rs
+++ b/field/src/interpolation.rs
@@ -77,6 +77,9 @@ pub fn interpolate2<F: Field>(points: [(F, F); 2], x: F) -> F {
 
 #[cfg(test)]
 mod tests {
+    #[cfg(feature = "cuda")]
+    use zeknox::init_twiddle_factors_rs;
+
     use super::*;
     use crate::extension::quartic::QuarticExtension;
     use crate::goldilocks_field::GoldilocksField;
@@ -87,7 +90,12 @@ mod tests {
     fn interpolant_random() {
         type F = GoldilocksField;
 
-        for deg in 0..10 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg in 2..10 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(deg));
             let domain = F::rand_vec(deg);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -101,7 +109,13 @@ mod tests {
     fn interpolant_random_roots_of_unity() {
         type F = GoldilocksField;
 
-        for deg_log in 0..4 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg_log in 1..4 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, deg_log);
+
             let deg = 1 << deg_log;
             let domain = F::two_adic_subgroup(deg_log);
             let coeffs = F::rand_vec(deg);
@@ -116,8 +130,15 @@ mod tests {
     fn interpolant_random_overspecified() {
         type F = GoldilocksField;
 
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
         for deg in 0..10 {
             let points = deg + 5;
+
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(points));
+
             let domain = F::rand_vec(points);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -137,6 +158,8 @@ mod tests {
         let points = [(F::rand(), F::rand()), (F::rand(), F::rand())];
         let x = F::rand();
 
+        #[cfg(feature = "cuda")]
+        init_twiddle_factors_rs(0, 2);
         let ev0 = interpolant(&points).eval(x);
         let ev1 = interpolate(&points, x, &barycentric_weights(&points));
         let ev2 = interpolate2(points, x);
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index c13bbca27..2a97352c4 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -440,6 +440,8 @@ impl<F: Field> Mul for &PolynomialCoeffs<F> {
 mod tests {
     use std::time::Instant;
 
+    #[cfg(feature = "cuda")]
+    use plonky2_util::log2_ceil;
     use rand::rngs::OsRng;
     use rand::Rng;
 
@@ -479,6 +481,13 @@ mod tests {
 
         let k = 8;
         let n = 1 << k;
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            zeknox::init_twiddle_factors_rs(0, k);
+        }
+
         let poly = PolynomialCoeffs::new(F::rand_vec(n));
         let shift = F::rand();
         let coset_evals = poly.coset_fft(shift).values;
@@ -500,6 +509,13 @@ mod tests {
 
         let k = 8;
         let n = 1 << k;
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            zeknox::init_twiddle_factors_rs(0, k);
+        }
+
         let evals = PolynomialValues::new(F::rand_vec(n));
         let shift = F::rand();
         let coeffs = evals.clone().coset_ifft(shift);
@@ -520,6 +536,12 @@ mod tests {
         type F = GoldilocksField;
         let mut rng = OsRng;
         let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000));
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            zeknox::init_twiddle_factors_rs(0, log2_ceil(a_deg + b_deg + 1));
+        }
         let a = PolynomialCoeffs::new(F::rand_vec(a_deg));
         let b = PolynomialCoeffs::new(F::rand_vec(b_deg));
         let m1 = &a * &b;
@@ -537,11 +559,24 @@ mod tests {
         let mut rng = OsRng;
         let a_deg = rng.gen_range(0..1_000);
         let n = rng.gen_range(1..1_000);
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            for i in 1..=log2_ceil(max(a_deg, n)) + 1 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let mut a = PolynomialCoeffs::new(F::rand_vec(a_deg + 1));
+        println!("a {} b {}", a.len(), n);
+
         if a.coeffs[0].is_zero() {
             a.coeffs[0] = F::ONE; // First coefficient needs to be nonzero.
         }
         let b = a.inv_mod_xn(n);
+        println!("a {} b {}", a.len(), b.len());
+
         let mut m = &a * &b;
         m.coeffs.truncate(n);
         m.trim();
@@ -575,6 +610,15 @@ mod tests {
         type F = GoldilocksField;
         let mut rng = OsRng;
         let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000));
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            for i in 1..=log2_ceil(max(a_deg, b_deg)) + 1 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let a = PolynomialCoeffs::new(F::rand_vec(a_deg));
         let b = PolynomialCoeffs::new(F::rand_vec(b_deg));
         let (q, r) = a.div_rem(&b);
@@ -606,6 +650,7 @@ mod tests {
         let mut rng = OsRng;
         let l = 14;
         let n = 1 << l;
+
         let g = F::primitive_root_of_unity(l);
         let xn_minus_one = {
             let mut xn_min_one_vec = vec![F::ZERO; n + 1];
@@ -616,6 +661,15 @@ mod tests {
 
         let a = g.exp_u64(rng.gen_range(0..(n as u64)));
         let denom = PolynomialCoeffs::new(vec![-a, F::ONE]);
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            for i in 1..=l + 1 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let now = Instant::now();
         xn_minus_one.div_rem(&denom);
         println!("Division time: {:?}", now.elapsed());
diff --git a/field/src/types.rs b/field/src/types.rs
index d714b7a84..5a34bb6a3 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -91,6 +91,9 @@ pub trait Field:
     /// The bit length of the field order.
     const BITS: usize;
 
+    /// Whether this field is supported by cuda
+    const CUDA_SUPPORT: bool = false;
+
     fn order() -> BigUint;
     fn characteristic() -> BigUint;
 

From fb3c96f7558ad22c1d9d05083a48aa6150563280 Mon Sep 17 00:00:00 2001
From: zhenfeizhang <zhenfei.zhang@hotmail.com>
Date: Tue, 18 Nov 2025 08:10:50 -0500
Subject: [PATCH 03/37] fix FFT/cosetFFT GPUs

---
 field/src/fft.rs            | 475 +++++++++++++++++++++++++++++++++++-
 field/src/polynomial/mod.rs |   9 +
 2 files changed, 483 insertions(+), 1 deletion(-)

diff --git a/field/src/fft.rs b/field/src/fft.rs
index 85defc48b..eeb86b62d 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -52,6 +52,177 @@ fn fft_dispatch_gpu<F: Field>(
     }
 }
 
+/// Batch FFT over `num_polys` polynomials of `poly_size` elements stored back to back in
+/// `inputs`: one batched GPU NTT if `F::CUDA_SUPPORT`, otherwise a CPU FFT per polynomial.
+#[cfg(feature = "cuda")]
+fn fft_batch_dispatch_gpu<F: Field>(
+    inputs: &mut [F],
+    poly_size: usize,
+    num_polys: usize,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+
+    // Only a raw pointer reaches the GPU call, so check the buffer covers the whole batch.
+    assert!(
+        inputs.len() >= poly_size * num_polys,
+        "batch FFT buffer is too short"
+    );
+
+    if F::CUDA_SUPPORT {
+        let mut cfg = NTTConfig::default();
+        cfg.batches = num_polys as u32;
+        let lg_n = poly_size.trailing_zeros() as usize;
+        ntt_batch(0, inputs.as_mut_ptr(), lg_n, cfg);
+    } else {
+        for i in 0..num_polys {
+            let chunk = &mut inputs[i * poly_size..(i + 1) * poly_size];
+            fft_dispatch_cpu(chunk, zero_factor, root_table);
+        }
+    }
+}
+
+/// Coset FFT of `poly` for the shift `F::coset_shift()`, on the GPU when `F` supports CUDA.
+///
+/// Without CUDA support the coefficients are scaled by the powers of `F::coset_shift()` and
+/// passed to `fft_with_options`; `zero_factor` and `root_table` are only used on that path.
+///
+/// NOTE: this function does not register the coset generator with zeknox. The tests in this
+/// file call `zeknox::init_coset_rs` and `init_twiddle_factors_rs` before invoking it.
+#[cfg(feature = "cuda")]
+pub(crate) fn coset_fft_gpu<F: Field>(
+    poly: PolynomialCoeffs<F>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> PolynomialValues<F> {
+    use zeknox::ntt_batch;
+    use zeknox::types::{NTTConfig, NTTType};
+
+    if !F::CUDA_SUPPORT {
+        // CPU fallback: scale coefficient i by shift^i, then run a plain FFT.
+        let shifted: Vec<F> = F::coset_shift()
+            .powers()
+            .zip(&poly.coeffs)
+            .map(|(power, &coeff)| power * coeff)
+            .collect();
+        return fft_with_options(shifted.into(), zero_factor, root_table);
+    }
+
+    let mut buffer = poly.coeffs;
+    let lg_n = buffer.len().trailing_zeros() as usize;
+
+    // Request a coset NTT from zeknox instead of shifting the coefficients on the host.
+    let mut cfg = NTTConfig::default();
+    cfg.with_coset = true;
+    cfg.ntt_type = NTTType::Coset;
+
+    // NOTE(review): the leading `0` is presumably the CUDA device index; confirm in zeknox.
+    ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg);
+
+    PolynomialValues::new(buffer)
+}
+
+/// GPU-backed implementation of `coset_fft_batch_with_options`.
+/// All polynomials must share one power-of-two length. With CUDA support they are packed
+/// into one contiguous buffer and transformed by a single batched coset NTT; otherwise each
+/// is shifted and transformed on the CPU (`zero_factor`/`root_table` only matter there).
+#[cfg(feature = "cuda")]
+fn coset_fft_batch_gpu<F: Field>(
+    polys: Vec<PolynomialCoeffs<F>>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> Vec<PolynomialValues<F>> {
+    use zeknox::ntt_batch;
+    use zeknox::types::{NTTConfig, NTTType};
+
+    if polys.is_empty() {
+        return Vec::new();
+    }
+
+    let num_polys = polys.len();
+    let poly_size = polys[0].len();
+    assert!(
+        polys.iter().all(|p| p.len() == poly_size),
+        "All polynomials must have the same size for batch coset FFT"
+    );
+    // `trailing_zeros` below is log2 only for powers of two; reject anything else up front.
+    assert!(
+        poly_size.is_power_of_two(),
+        "Polynomial size must be a power of 2"
+    );
+
+    if !F::CUDA_SUPPORT {
+        // CPU fallback: scale coefficient i by shift^i, then run a plain FFT.
+        return polys
+            .into_iter()
+            .map(|poly| {
+                let shifted: Vec<F> = F::coset_shift()
+                    .powers()
+                    .zip(&poly.coeffs)
+                    .map(|(power, &coeff)| power * coeff)
+                    .collect();
+                fft_with_options(shifted.into(), zero_factor, root_table)
+            })
+            .collect();
+    }
+
+    // Pack the polynomials back to back so the GPU sees one contiguous batch.
+    let mut buffer: Vec<F> = Vec::with_capacity(num_polys * poly_size);
+    for poly in polys {
+        buffer.extend_from_slice(&poly.coeffs);
+    }
+
+    let mut cfg = NTTConfig::default();
+    cfg.batches = num_polys as u32;
+    cfg.with_coset = true;
+    cfg.ntt_type = NTTType::Coset;
+    let lg_n = poly_size.trailing_zeros() as usize;
+    ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg);
+
+    buffer
+        .chunks_exact(poly_size)
+        .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+        .collect()
+}
+
+/// Computes the coset FFT (shift `F::coset_shift()`) of every polynomial in `polys`.
+/// Equivalent to `coset_fft_batch_with_options(polys, None, None)`; all polynomials must
+/// have the same power-of-two size. Results are returned in input order.
+pub fn coset_fft_batch<F: Field>(polys: Vec<PolynomialCoeffs<F>>) -> Vec<PolynomialValues<F>> {
+    coset_fft_batch_with_options(polys, None, None)
+}
+
+/// Batched coset FFT with explicit FFT options.
+/// All polynomials must have the same size (power of 2); outputs keep the input order.
+/// With the `cuda` feature the whole batch is handed to `coset_fft_batch_gpu`. Otherwise
+/// each polynomial has its coefficients scaled by the powers of `F::coset_shift()` and is
+/// then passed to `fft_with_options` together with `zero_factor` and `root_table`.
+pub fn coset_fft_batch_with_options<F: Field>(
+    polys: Vec<PolynomialCoeffs<F>>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> Vec<PolynomialValues<F>> {
+    #[cfg(feature = "cuda")]
+    return coset_fft_batch_gpu(polys, zero_factor, root_table);
+
+    #[cfg(not(feature = "cuda"))]
+    {
+        // Host-only build: shift and transform the polynomials one at a time.
+        let transform_one = |poly: PolynomialCoeffs<F>| {
+            // Multiply coefficient i by shift^i.
+            let shifted: Vec<F> = F::coset_shift()
+                .powers()
+                .zip(&poly.coeffs)
+                .map(|(power, &coeff)| power * coeff)
+                .collect();
+            fft_with_options(shifted.into(), zero_factor, root_table)
+        };
+        polys.into_iter().map(transform_one).collect()
+    }
+}
+
 fn fft_dispatch_cpu<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
@@ -103,6 +274,66 @@ pub fn fft_with_options<F: Field>(
     PolynomialValues::new(buffer)
 }
 
+/// Computes the FFT of every polynomial in `polys` as one batch.
+/// Equivalent to `fft_batch_with_options(polys, None, None)`; all polynomials must have
+/// the same power-of-two size. Results are returned in input order.
+#[inline]
+pub fn fft_batch<F: Field>(polys: Vec<PolynomialCoeffs<F>>) -> Vec<PolynomialValues<F>> {
+    fft_batch_with_options(polys, None, None)
+}
+
+/// Batched FFT with explicit FFT options.
+/// All polynomials must have the same power-of-two size; outputs keep the input order.
+/// An empty `polys` yields an empty vector. Panics if the sizes differ or are not a
+/// power of two.
+pub fn fft_batch_with_options<F: Field>(
+    polys: Vec<PolynomialCoeffs<F>>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> Vec<PolynomialValues<F>> {
+    let num_polys = polys.len();
+    if num_polys == 0 {
+        return Vec::new();
+    }
+
+    // Every polynomial must match the first one's length, and that length must be 2^k.
+    let poly_size = polys[0].len();
+    let same_size = polys.iter().all(|p| p.len() == poly_size);
+    assert!(
+        same_size,
+        "All polynomials must have the same size for batch FFT"
+    );
+    assert!(
+        poly_size.is_power_of_two(),
+        "Polynomial size must be a power of 2"
+    );
+
+    // Pack the coefficients back to back: polynomial `i` occupies
+    // `buffer[i * poly_size..(i + 1) * poly_size]`.
+    let mut buffer: Vec<F> = Vec::with_capacity(num_polys * poly_size);
+    for poly in polys {
+        buffer.extend_from_slice(&poly.coeffs);
+    }
+
+    // With the `cuda` feature the whole buffer is transformed by one dispatch call.
+    #[cfg(feature = "cuda")]
+    fft_batch_dispatch_gpu(&mut buffer, poly_size, num_polys, zero_factor, root_table);
+
+    // Without it, each polynomial-sized chunk is transformed in place on the CPU.
+    #[cfg(not(feature = "cuda"))]
+    {
+        for chunk in buffer.chunks_exact_mut(poly_size) {
+            fft_dispatch_cpu(chunk, zero_factor, root_table);
+        }
+    }
+
+    // Hand each chunk back as its own `PolynomialValues`.
+    buffer
+        .chunks_exact(poly_size)
+        .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+        .collect()
+}
+
 #[inline]
 pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
     ifft_with_options(poly, None, None)
@@ -252,7 +483,7 @@ mod tests {
     #[cfg(feature = "cuda")]
     use zeknox::init_twiddle_factors_rs;
 
-    use crate::fft::{fft, fft_with_options, ifft};
+    use crate::fft::{coset_fft_batch, fft, fft_batch, fft_with_options, ifft};
     use crate::goldilocks_field::GoldilocksField;
     use crate::polynomial::{PolynomialCoeffs, PolynomialValues};
     use crate::types::Field;
@@ -302,6 +533,248 @@ mod tests {
         }
     }
 
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_fft_gpu_vs_cpu_single() {
+        type F = GoldilocksField;
+
+        // Exercise several power-of-two sizes (2^8 through 2^14).
+        for log_size in [8, 10, 12, 14] {
+            let size = 1 << log_size;
+            zeknox::clear_cuda_errors_rs();
+            init_twiddle_factors_rs(0, log_size);
+
+            // Deterministic (not random) coefficients: i * 7919 mod 1_000_000.
+            let coeffs: Vec<F> = (0..size)
+                .map(|i| F::from_canonical_usize(i * 7919 % 1000000))
+                .collect();
+
+            let poly = PolynomialCoeffs {
+                coeffs: coeffs.clone(),
+            };
+
+            // Result under test: the public `fft` entry point (expected to dispatch to the GPU).
+            let gpu_result = fft(poly.clone());
+
+            // Reference result: call the CPU implementation directly on a copy.
+            let mut cpu_buffer = coeffs.clone();
+            super::fft_dispatch_cpu(&mut cpu_buffer, None, None);
+            let cpu_result = PolynomialValues::new(cpu_buffer);
+
+            // The two results must agree in length and element by element.
+            assert_eq!(
+                gpu_result.len(),
+                cpu_result.len(),
+                "GPU and CPU results have different lengths for size {}",
+                size
+            );
+
+            for i in 0..size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Mismatch at index {} for polynomial size {}",
+                    i, size
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_fft_batch_gpu_vs_cpu() {
+        type F = GoldilocksField;
+
+        let poly_size: usize = 1 << 10; // 1024 elements per polynomial
+        let num_polys = 8;
+        let log_size = poly_size.trailing_zeros() as usize;
+
+        zeknox::clear_cuda_errors_rs();
+        init_twiddle_factors_rs(0, log_size);
+
+        // Deterministic (not random) polynomials; `batch_idx` makes each one distinct.
+        let polys: Vec<PolynomialCoeffs<F>> = (0..num_polys)
+            .map(|batch_idx| {
+                let coeffs: Vec<F> = (0..poly_size)
+                    .map(|i| F::from_canonical_usize((i * 7919 + batch_idx * 12345) % 1000000))
+                    .collect();
+                PolynomialCoeffs { coeffs }
+            })
+            .collect();
+
+        // Result under test: the batched entry point, called on a copy of the inputs.
+        let gpu_results = fft_batch(polys.clone());
+
+        // Reference results: run the CPU implementation on each polynomial separately.
+        let cpu_results: Vec<PolynomialValues<F>> = polys
+            .into_iter()
+            .map(|poly| {
+                let mut buffer = poly.coeffs.clone();
+                super::fft_dispatch_cpu(&mut buffer, None, None);
+                PolynomialValues::new(buffer)
+            })
+            .collect();
+
+        // Same number of outputs, and every output must match element by element.
+        assert_eq!(gpu_results.len(), cpu_results.len());
+        for (batch_idx, (gpu_result, cpu_result)) in
+            gpu_results.iter().zip(cpu_results.iter()).enumerate()
+        {
+            assert_eq!(gpu_result.len(), cpu_result.len());
+            for i in 0..poly_size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Batch FFT mismatch at batch {} index {}",
+                    batch_idx, i
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_coset_fft_gpu_vs_cpu_single() {
+        use zeknox::init_coset_rs;
+
+        use crate::types::PrimeField64;
+        type F = GoldilocksField;
+
+        for log_size in [8, 10, 12] {
+            let size = 1 << log_size;
+            zeknox::clear_cuda_errors_rs();
+            init_twiddle_factors_rs(0, log_size);
+
+            // Register the coset generator (`F::coset_shift()`) with zeknox for this size.
+            let coset_gen_u64 = F::coset_shift().to_canonical_u64();
+            init_coset_rs(0, log_size, coset_gen_u64);
+
+            // Deterministic (not random) coefficients: i * 8191 mod 1_000_000.
+            let coeffs: Vec<F> = (0..size)
+                .map(|i| F::from_canonical_usize(i * 8191 % 1000000))
+                .collect();
+
+            let poly = PolynomialCoeffs {
+                coeffs: coeffs.clone(),
+            };
+
+            // Result under test: the GPU coset FFT helper, called directly.
+            let gpu_result = super::coset_fft_gpu(poly.clone(), None, None);
+
+            // Reference result: scale coefficient i by shift^i, then run the CPU FFT.
+            let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                .powers()
+                .zip(&coeffs)
+                .map(|(r, &c)| r * c)
+                .collect::<Vec<_>>()
+                .into();
+
+            let mut cpu_buffer = modified_poly.coeffs;
+            super::fft_dispatch_cpu(&mut cpu_buffer, None, None);
+            let cpu_result = PolynomialValues::new(cpu_buffer);
+
+            // The two results must agree in length and element by element.
+            assert_eq!(
+                gpu_result.len(),
+                cpu_result.len(),
+                "GPU and CPU coset FFT results have different lengths for size {}",
+                size
+            );
+
+            for i in 0..size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Coset FFT mismatch at index {} for polynomial size {}",
+                    i, size
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_coset_fft_batch_gpu_vs_cpu() {
+        use zeknox::init_coset_rs;
+
+        use crate::types::PrimeField64;
+        type F = GoldilocksField;
+
+        let poly_size: usize = 1 << 10; // 1024 elements per polynomial
+        let num_polys = 8;
+        let log_size = poly_size.trailing_zeros() as usize;
+
+        zeknox::clear_cuda_errors_rs();
+        init_twiddle_factors_rs(0, log_size);
+
+        // Register the coset generator (`F::coset_shift()`) with zeknox for this size.
+        let coset_gen_u64 = F::coset_shift().to_canonical_u64();
+        init_coset_rs(0, log_size, coset_gen_u64);
+
+        // Deterministic (not random) polynomials; `batch_idx` makes each one distinct.
+        let polys: Vec<PolynomialCoeffs<F>> = (0..num_polys)
+            .map(|batch_idx| {
+                let coeffs: Vec<F> = (0..poly_size)
+                    .map(|i| F::from_canonical_usize((i * 8191 + batch_idx * 54321) % 1000000))
+                    .collect();
+                PolynomialCoeffs { coeffs }
+            })
+            .collect();
+
+        // Result under test: the batched coset entry point, called on a copy of the inputs.
+        let gpu_results = coset_fft_batch(polys.clone());
+
+        // Reference results: shift each polynomial on the host, then run the CPU FFT.
+        let cpu_results: Vec<PolynomialValues<F>> = polys
+            .into_iter()
+            .map(|poly| {
+                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                    .powers()
+                    .zip(&poly.coeffs)
+                    .map(|(r, &c)| r * c)
+                    .collect::<Vec<_>>()
+                    .into();
+
+                let mut buffer = modified_poly.coeffs;
+                super::fft_dispatch_cpu(&mut buffer, None, None);
+                PolynomialValues::new(buffer)
+            })
+            .collect();
+
+        // Same number of outputs, and every output must match element by element.
+        assert_eq!(gpu_results.len(), cpu_results.len());
+        for (batch_idx, (gpu_result, cpu_result)) in
+            gpu_results.iter().zip(cpu_results.iter()).enumerate()
+        {
+            assert_eq!(gpu_result.len(), cpu_result.len());
+            for i in 0..poly_size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Batch coset FFT mismatch at batch {} index {}",
+                    batch_idx, i
+                );
+            }
+        }
+    }
+
+    /// An empty batch must produce an empty result instead of panicking.
+    #[test]
+    fn test_batch_fft_empty() {
+        type F = GoldilocksField;
+        let no_polys = Vec::<PolynomialCoeffs<F>>::new();
+        assert!(fft_batch(no_polys).is_empty());
+    }
+
+    /// Mixing polynomial lengths in one batch must trip the same-size assertion.
+    #[test]
+    #[should_panic(expected = "All polynomials must have the same size")]
+    fn test_batch_fft_different_sizes() {
+        type F = GoldilocksField;
+        // Both lengths are powers of two, so only the size mismatch can fail.
+        let mismatched: Vec<PolynomialCoeffs<F>> = [256usize, 512]
+            .iter()
+            .map(|&len| PolynomialCoeffs::new(vec![F::ONE; len]))
+            .collect();
+        let _ = fft_batch(mismatched);
+    }
+
     fn evaluate_naive<F: Field>(coefficients: &PolynomialCoeffs<F>) -> PolynomialValues<F> {
         let degree = coefficients.len();
         let degree_padded = 1 << log2_ceil(degree);
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index 2a97352c4..be8bf0ad9 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -283,6 +283,15 @@ impl<F: Field> PolynomialCoeffs<F> {
         zero_factor: Option<usize>,
         root_table: Option<&FftRootTable<F>>,
     ) -> PolynomialValues<F> {
+        #[cfg(feature = "cuda")]
+        {
+            if F::CUDA_SUPPORT && shift == F::coset_shift() {
+                // Use GPU coset FFT directly without CPU-side coefficient modification
+                return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
+            }
+        }
+
+        // CPU path: multiply by powers of shift, then do regular FFT
         let modified_poly: Self = shift
             .powers()
             .zip(&self.coeffs)

From c3aae3d9e6de1ab5ba8e2995eb3815e834890dbd Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Thu, 20 Nov 2025 10:48:52 -0500
Subject: [PATCH 04/37] fix merkle tree

---
 Cargo.toml                            |    2 +
 field/Cargo.toml                      |    4 +
 field/src/goldilocks_extensions.rs    |   25 +-
 field/src/goldilocks_field.rs         |    4 +-
 plonky2/Cargo.toml                    |    6 +
 plonky2/benches/merkle.rs             |    2 +-
 plonky2/examples/fibonacci.rs         |   21 +-
 plonky2/src/batch_fri/oracle.rs       |    2 +-
 plonky2/src/batch_fri/prover.rs       |    3 +-
 plonky2/src/fri/oracle.rs             |    4 +-
 plonky2/src/fri/prover.rs             |    3 +-
 plonky2/src/hash/keccak.rs            |    3 +-
 plonky2/src/hash/merkle_proofs.rs     |    8 +-
 plonky2/src/hash/merkle_tree.rs       | 1067 +++++++++++++++++++++++--
 plonky2/src/hash/mod.rs               |    2 +-
 plonky2/src/hash/path_compression.rs  |   15 +-
 plonky2/src/hash/poseidon.rs          |    3 +-
 plonky2/src/lib.rs                    |    2 +-
 plonky2/src/plonk/config.rs           |   10 +
 plonky2/src/util/serialization/mod.rs |   27 +-
 20 files changed, 1083 insertions(+), 130 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 81dbbde49..3bb243ecf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,10 +9,12 @@ hashbrown = { version = "0.14.3", default-features = false, features = ["ahash",
 itertools = { version = "0.11.0", default-features = false }
 log = { version = "0.4.14", default-features = false }
 num = { version = "0.4", default-features = false, features = ["rand"] }
+once_cell = { version = "1.18.0", default-features = false }
 rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
+zeknox = { path = "../zeknox/wrappers/rust" }
 
 [profile.release]
 opt-level = 3
diff --git a/field/Cargo.toml b/field/Cargo.toml
index e13f49efd..8ec0fac52 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -29,3 +29,7 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"]
 
 [lints]
 workspace = true
+
+[features]
+default = []
+cuda = []
\ No newline at end of file
diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
index 8f2d85253..6dd15ce0d 100644
--- a/field/src/goldilocks_extensions.rs
+++ b/field/src/goldilocks_extensions.rs
@@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/2)
     const DTH_ROOT: Self = Self(18446744069414584320);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
+        [Self(18081566051660590251), Self(16121475356294670766)];
 
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)];
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
 }
 
 impl Mul for QuadraticExtension<GoldilocksField> {
@@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/4)
     const DTH_ROOT: Self = Self(281474976710656);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] =
-        [Self(0), Self(8295451483910296135), Self(0), Self(0)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
+        Self(5024755240244648895),
+        Self(13227474371289740625),
+        Self(3912887029498544536),
+        Self(3900057112666848848),
+    ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
-        [Self(0), Self(0), Self(0), Self(17216955519093520442)];
+        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
 }
 
 impl Mul for QuarticExtension<GoldilocksField> {
@@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField {
     const DTH_ROOT: Self = Self(1041288259238279555);
 
     const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
-        Self(4624713872807171977),
-        Self(381988216716071028),
-        Self(14499722700050429911),
-        Self(4870631734967222356),
-        Self(4518902370426242880),
+        Self(2899034827742553394),
+        Self(13012057356839176729),
+        Self(14593811582388663055),
+        Self(7722900811313895436),
+        Self(4557222484695340057),
     ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index b0191ca59..4e459c908 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -77,14 +77,14 @@ impl Field for GoldilocksField {
     const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY;
 
     // Sage: `g = GF(p).multiplicative_generator()`
-    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720);
+    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7);
 
     // Sage:
     // ```
     // g_2 = g^((p - 1) / 2^32)
     // g_2.multiplicative_order().factor()
     // ```
-    const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926);
+    const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772);
 
     const BITS: usize = 64;
 
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 83ff08519..d9e31168d 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -13,10 +13,12 @@ categories.workspace = true
 
 [features]
 default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", "cuda"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
 std = ["anyhow/std", "rand/std", "itertools/use_std"]
 timing = ["std", "dep:web-time"]
+cuda = ["plonky2_field/cuda"]
 
 [dependencies]
 ahash = { workspace = true }
@@ -26,6 +28,7 @@ itertools = { workspace = true }
 keccak-hash = { version = "0.8.0", default-features = false }
 log = { workspace = true }
 num = { workspace = true }
+once_cell = { workspace = true }
 rand = { workspace = true }
 rand_chacha = { version = "0.3.1", optional = true, default-features = false }
 serde = { workspace = true, features = ["rc"] }
@@ -38,6 +41,9 @@ plonky2_field = { version = "1.0.0", path = "../field", default-features = false
 plonky2_maybe_rayon = { version = "1.0.0", path = "../maybe_rayon", default-features = false }
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
+# cuda accelerator wrapper
+zeknox = { workspace = true }
+
 
 [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies]
 getrandom = { version = "0.2", default-features = false, features = ["js"] }
diff --git a/plonky2/benches/merkle.rs b/plonky2/benches/merkle.rs
index 6230c1343..e9995be1a 100644
--- a/plonky2/benches/merkle.rs
+++ b/plonky2/benches/merkle.rs
@@ -23,7 +23,7 @@ pub(crate) fn bench_merkle_tree<F: RichField, H: Hasher<F>>(c: &mut Criterion) {
         let size = 1 << size_log;
         group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, _| {
             let leaves = vec![F::rand_vec(ELEMS_PER_LEAF); size];
-            b.iter(|| MerkleTree::<F, H>::new(leaves.clone(), 0));
+            b.iter(|| MerkleTree::<F, H>::new_from_2d(leaves.clone(), 0));
         });
     }
 }
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index 578dc2424..79101dd3b 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -1,14 +1,21 @@
-use anyhow::Result;
+use anyhow::{Ok, Result};
+use log::Level;
 use plonky2::field::types::Field;
 use plonky2::iop::witness::{PartialWitness, WitnessWrite};
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 use plonky2::plonk::circuit_data::CircuitConfig;
 use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
+use plonky2::util::timing::TimingTree;
 
 /// An example of using Plonky2 to prove a statement of the form
 /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b."
 /// When a == 0 and b == 1, this is proving knowledge of the 100th (standard) Fibonacci number.
 fn main() -> Result<()> {
+    env_logger::Builder::from_default_env()
+        .format_timestamp(None)
+        .filter_level(log::LevelFilter::Debug)
+        .init();
+
     const D: usize = 2;
     type C = PoseidonGoldilocksConfig;
     type F = <C as GenericConfig<D>>::F;
@@ -21,7 +28,7 @@ fn main() -> Result<()> {
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..99 {
+    for _ in 0..999999 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
@@ -38,12 +45,16 @@ fn main() -> Result<()> {
     pw.set_target(initial_b, F::ONE)?;
 
     let data = builder.build::<C>();
-    let proof = data.prove(pw)?;
+    let mut timing = TimingTree::new("prove", Level::Info);
+    println!("Starting proof generation...");
+    let proof = plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?;
 
     println!(
         "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
         proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
     );
-
-    data.verify(proof)
+    timing.print();
+    data.verify(proof)?;
+    println!("Proof verified!");
+    Ok(())
 }
diff --git a/plonky2/src/batch_fri/oracle.rs b/plonky2/src/batch_fri/oracle.rs
index 58deeaa3c..bdf6da72a 100644
--- a/plonky2/src/batch_fri/oracle.rs
+++ b/plonky2/src/batch_fri/oracle.rs
@@ -15,7 +15,7 @@ use crate::fri::oracle::PolynomialBatch;
 use crate::fri::proof::FriProof;
 use crate::fri::structure::{FriBatchInfo, FriInstanceInfo};
 use crate::fri::FriParams;
-use crate::hash::batch_merkle_tree::BatchMerkleTree;
+// use crate::hash::batch_merkle_tree::BatchMerkleTree;
 use crate::hash::hash_types::RichField;
 use crate::iop::challenger::Challenger;
 use crate::plonk::config::GenericConfig;
diff --git a/plonky2/src/batch_fri/prover.rs b/plonky2/src/batch_fri/prover.rs
index e71fe25b4..bed056c0b 100644
--- a/plonky2/src/batch_fri/prover.rs
+++ b/plonky2/src/batch_fri/prover.rs
@@ -104,7 +104,8 @@ pub(crate) fn batch_fri_committed_trees<
 
         reverse_index_bits_in_place(&mut final_values.values);
         let chunked_values = final_values.values.par_chunks(arity).map(flatten).collect();
-        let tree = MerkleTree::<F, C::Hasher>::new(chunked_values, fri_params.config.cap_height);
+        let tree =
+            MerkleTree::<F, C::Hasher>::new_from_2d(chunked_values, fri_params.config.cap_height);
 
         challenger.observe_cap(&tree.cap);
         trees.push(tree);
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index e413071a4..bf986fe64 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -99,7 +99,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         let merkle_tree = timed!(
             timing,
             "build Merkle tree",
-            MerkleTree::new(leaves, cap_height)
+            MerkleTree::new_from_2d(leaves, cap_height)
         );
 
         Self {
@@ -142,7 +142,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     pub fn get_lde_values(&self, index: usize, step: usize) -> &[F] {
         let index = index * step;
         let index = reverse_bits(index, self.degree_log + self.rate_bits);
-        let slice = &self.merkle_tree.leaves[index];
+        let slice = &self.merkle_tree.get(index);
         &slice[..slice.len() - if self.blinding { SALT_SIZE } else { 0 }]
     }
 
diff --git a/plonky2/src/fri/prover.rs b/plonky2/src/fri/prover.rs
index 24c88ced7..e5792cb2c 100644
--- a/plonky2/src/fri/prover.rs
+++ b/plonky2/src/fri/prover.rs
@@ -101,7 +101,8 @@ fn fri_committed_trees<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>,
             .par_chunks(arity)
             .map(|chunk: &[F::Extension]| flatten(chunk))
             .collect();
-        let tree = MerkleTree::<F, C::Hasher>::new(chunked_values, fri_params.config.cap_height);
+        let tree =
+            MerkleTree::<F, C::Hasher>::new_from_2d(chunked_values, fri_params.config.cap_height);
 
         challenger.observe_cap(&tree.cap);
         trees.push(tree);
diff --git a/plonky2/src/hash/keccak.rs b/plonky2/src/hash/keccak.rs
index d3fa8c4b2..61e7cb87e 100644
--- a/plonky2/src/hash/keccak.rs
+++ b/plonky2/src/hash/keccak.rs
@@ -7,7 +7,7 @@ use keccak_hash::keccak;
 
 use crate::hash::hash_types::{BytesHash, RichField};
 use crate::hash::hashing::PlonkyPermutation;
-use crate::plonk::config::Hasher;
+use crate::plonk::config::{Hasher, HasherType};
 use crate::util::serialization::Write;
 
 pub const SPONGE_RATE: usize = 8;
@@ -103,6 +103,7 @@ impl<F: RichField> PlonkyPermutation<F> for KeccakPermutation<F> {
 pub struct KeccakHash<const N: usize>;
 impl<F: RichField, const N: usize> Hasher<F> for KeccakHash<N> {
     const HASH_SIZE: usize = N;
+    const HASHER_TYPE: HasherType = HasherType::Keccak;
     type Hash = BytesHash<N>;
     type Permutation = KeccakPermutation<F>;
 
diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs
index 424e03ae6..892564932 100644
--- a/plonky2/src/hash/merkle_proofs.rs
+++ b/plonky2/src/hash/merkle_proofs.rs
@@ -342,7 +342,8 @@ mod tests {
         let n = 1 << log_n;
         let cap_height = 1;
         let leaves = random_data::<F>(n, 7);
-        let tree = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new(leaves, cap_height);
+        let tree =
+            MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves, cap_height);
         let i: usize = OsRng.gen_range(0..n);
         let proof = tree.prove(i);
 
@@ -359,9 +360,10 @@ mod tests {
         let i_c = builder.constant(F::from_canonical_usize(i));
         let i_bits = builder.split_le(i_c, log_n);
 
-        let data = builder.add_virtual_targets(tree.leaves[i].len());
+        let data = builder.add_virtual_targets(tree.leaf_size);
+        let leaf = tree.get(i);
         for j in 0..data.len() {
-            pw.set_target(data[j], tree.leaves[i][j])?;
+            pw.set_target(data[j], leaf[j])?;
         }
 
         builder.verify_merkle_proof_to_cap::<<C as GenericConfig<D>>::InnerHasher>(
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 31bcf5e37..b1c5dcc37 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -1,16 +1,51 @@
-#[cfg(not(feature = "std"))]
-use alloc::vec::Vec;
 use core::mem::MaybeUninit;
 use core::slice;
+use std::collections::HashSet;
+#[cfg(feature = "cuda")]
+use std::sync::Arc;
+#[cfg(feature = "cuda")]
+use std::sync::Mutex;
+use std::time::Instant;
+#[cfg(not(feature = "std"))]
+use std::vec::Vec;
 
+use num::range;
+#[cfg(feature = "cuda")]
+use once_cell::sync::Lazy;
 use plonky2_maybe_rayon::*;
 use serde::{Deserialize, Serialize};
+#[cfg(feature = "cuda")]
+use zeknox::device::memory::HostOrDeviceSlice;
+#[cfg(feature = "cuda")]
+use zeknox::device::stream::CudaStream;
+#[cfg(feature = "cuda")]
+use zeknox::fill_digests_buf_linear_gpu_with_gpu_ptr;
+#[cfg(feature = "cuda")]
+use zeknox::fill_digests_buf_linear_multigpu_with_gpu_ptr;
 
 use crate::hash::hash_types::RichField;
+#[cfg(feature = "cuda")]
+use crate::hash::hash_types::NUM_HASH_OUT_ELTS;
 use crate::hash::merkle_proofs::MerkleProof;
+#[cfg(feature = "cuda")]
+use crate::plonk::config::HasherType;
 use crate::plonk::config::{GenericHashOut, Hasher};
 use crate::util::log2_strict;
 
+#[cfg(feature = "cuda")]
+pub static GPU_ID: Lazy<Arc<Mutex<u64>>> = Lazy::new(|| Arc::new(Mutex::new(0)));
+/// Prints the milliseconds elapsed since `now`, labelled with `msg`.
+#[cfg(all(feature = "timing", feature = "cuda"))]
+fn print_time(now: Instant, msg: &str) {
+    println!("Time {} {} ms", msg, now.elapsed().as_millis());
+}
+/// No-op variant compiled unless both the `timing` and `cuda` features are enabled.
+#[cfg(not(all(feature = "timing", feature = "cuda")))]
+fn print_time(_now: Instant, _msg: &str) {}
+/// While this is `true`, the multi-GPU hashing entry point is never selected.
+#[cfg(feature = "cuda")]
+const FORCE_SINGLE_GPU: bool = true;
+
 /// The Merkle cap of height `h` of a Merkle tree is the `h`-th layer (from the root) of the tree.
 /// It can be used in place of the root to verify Merkle paths, which are `h` elements shorter.
 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -45,7 +80,10 @@ impl<F: RichField, H: Hasher<F>> MerkleCap<F, H> {
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct MerkleTree<F: RichField, H: Hasher<F>> {
     /// The data in the leaves of the Merkle tree.
-    pub leaves: Vec<Vec<F>>,
+    /// Stored flattened: leaf `i` occupies `leaves[i * leaf_size..(i + 1) * leaf_size]`.
+    pub leaves: Vec<F>,
+    /// Number of field elements in each leaf.
+    pub leaf_size: usize,
 
     /// The digests in the tree. Consists of `cap.len()` sub-trees, each corresponding to one
     /// element in `cap`. Each subtree is contiguous and located at
@@ -64,6 +102,7 @@ pub struct MerkleTree<F: RichField, H: Hasher<F>> {
 impl<F: RichField, H: Hasher<F>> Default for MerkleTree<F, H> {
     fn default() -> Self {
         Self {
+            leaf_size: 0,
             leaves: Vec::new(),
             digests: Vec::new(),
             cap: MerkleCap::default(),
@@ -71,7 +110,7 @@ impl<F: RichField, H: Hasher<F>> Default for MerkleTree<F, H> {
     }
 }
 
-pub(crate) fn capacity_up_to_mut<T>(v: &mut Vec<T>, len: usize) -> &mut [MaybeUninit<T>] {
+fn capacity_up_to_mut<T>(v: &mut Vec<T>, len: usize) -> &mut [MaybeUninit<T>] {
     assert!(v.capacity() >= len);
     let v_ptr = v.as_mut_ptr().cast::<MaybeUninit<T>>();
     unsafe {
@@ -83,59 +122,105 @@ pub(crate) fn capacity_up_to_mut<T>(v: &mut Vec<T>, len: usize) -> &mut [MaybeUn
     }
 }
 
+/// Hashes one subtree, storing every non-root digest in `digests_buf` and returning the root.
+///
+/// `leaves` is the subtree's flattened leaf data, `leaf_size` elements per leaf. Digests are
+/// laid out level by level: the root's two children first, the leaf hashes last.
-pub(crate) fn fill_subtree<F: RichField, H: Hasher<F>>(
+fn fill_subtree<F: RichField, H: Hasher<F>>(
     digests_buf: &mut [MaybeUninit<H::Hash>],
-    leaves: &[Vec<F>],
+    leaves: &[F],
+    leaf_size: usize,
 ) -> H::Hash {
-    assert_eq!(leaves.len(), digests_buf.len() / 2 + 1);
-    if digests_buf.is_empty() {
-        H::hash_or_noop(&leaves[0])
-    } else {
-        // Layout is: left recursive output || left child digest
-        //             || right child digest || right recursive output.
-        // Split `digests_buf` into the two recursive outputs (slices) and two child digests
-        // (references).
-        let (left_digests_buf, right_digests_buf) = digests_buf.split_at_mut(digests_buf.len() / 2);
-        let (left_digest_mem, left_digests_buf) = left_digests_buf.split_last_mut().unwrap();
-        let (right_digest_mem, right_digests_buf) = right_digests_buf.split_first_mut().unwrap();
-        // Split `leaves` between both children.
-        let (left_leaves, right_leaves) = leaves.split_at(leaves.len() / 2);
-
-        let (left_digest, right_digest) = plonky2_maybe_rayon::join(
-            || fill_subtree::<F, H>(left_digests_buf, left_leaves),
-            || fill_subtree::<F, H>(right_digests_buf, right_leaves),
-        );
+    let leaves_count = leaves.len() / leaf_size;
 
-        left_digest_mem.write(left_digest);
-        right_digest_mem.write(right_digest);
-        H::two_to_one(left_digest, right_digest)
+    // One leaf: its hash is the subtree root. `digests_buf` may be empty in this case, so
+    // only store the hash when there is a slot for it instead of indexing unconditionally.
+    if leaves_count == 1 {
+        let hash = H::hash_or_noop(leaves);
+        if let Some(slot) = digests_buf.first_mut() {
+            slot.write(hash);
+        }
+        return hash;
+    }
+    // Two leaves: store both leaf hashes and return their combination.
+    if leaves_count == 2 {
+        let (leaf1, leaf2) = leaves.split_at(leaf_size);
+        let hash_left = H::hash_or_noop(leaf1);
+        let hash_right = H::hash_or_noop(leaf2);
+        digests_buf[0].write(hash_left);
+        digests_buf[1].write(hash_right);
+        return H::two_to_one(hash_left, hash_right);
     }
+
+    assert_eq!(leaves_count, digests_buf.len() / 2 + 1);
+    // Leaf hashes fill the tail of the buffer; they are independent, so hash them in parallel.
+    let (_, digests_leaves) = digests_buf.split_at_mut(digests_buf.len() - leaves_count);
+    digests_leaves
+        .into_par_iter()
+        .enumerate()
+        .for_each(|(leaf_idx, digest)| {
+            let leaf = &leaves[leaf_idx * leaf_size..(leaf_idx + 1) * leaf_size];
+            digest.write(H::hash_or_noop(leaf));
+        });
+
+    // Internal levels, deepest first; nodes within one level can be hashed in parallel.
+    let mut last_index = digests_buf.len() - leaves_count;
+    for level_log in range(1, log2_strict(leaves_count)).rev() {
+        let level_size = 1 << level_log;
+        let (_, digests_slice) = digests_buf.split_at_mut(last_index - level_size);
+        let (digests_slice, next_digests) = digests_slice.split_at_mut(level_size);
+        digests_slice
+            .into_par_iter()
+            .zip(last_index - level_size..last_index)
+            .for_each(|(digest, idx)| {
+                let left_idx = 2 * (idx + 1) - last_index;
+                let right_idx = left_idx + 1;
+                // SAFETY: both children sit in `next_digests`, i.e. in the level below (or
+                // the leaf hashes), which was completely written before this level started.
+                unsafe {
+                    let left_digest = next_digests[left_idx].assume_init();
+                    let right_digest = next_digests[right_idx].assume_init();
+                    digest.write(H::two_to_one(left_digest, right_digest));
+                }
+            });
+        last_index -= level_size;
+    }
+
+    // SAFETY: slots 0 and 1 are the root's children, written by the last pass of the loop.
+    let (left_digest, right_digest) =
+        unsafe { (digests_buf[0].assume_init(), digests_buf[1].assume_init()) };
+    H::two_to_one(left_digest, right_digest)
 }
 
-pub(crate) fn fill_digests_buf<F: RichField, H: Hasher<F>>(
+fn fill_digests_buf<F: RichField, H: Hasher<F>>(
     digests_buf: &mut [MaybeUninit<H::Hash>],
     cap_buf: &mut [MaybeUninit<H::Hash>],
-    leaves: &[Vec<F>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
     cap_height: usize,
 ) {
     // Special case of a tree that's all cap. The usual case will panic because we'll try to split
     // an empty slice into chunks of `0`. (We would not need this if there was a way to split into
     // `blah` chunks as opposed to chunks _of_ `blah`.)
+    let leaves_count = leaves.len() / leaf_size;
+
     if digests_buf.is_empty() {
-        debug_assert_eq!(cap_buf.len(), leaves.len());
+        debug_assert_eq!(cap_buf.len(), leaves_count);
         cap_buf
             .par_iter_mut()
-            .zip(leaves)
-            .for_each(|(cap_buf, leaf)| {
+            .enumerate()
+            .for_each(|(leaf_idx, cap_buf)| {
+                let (_, r) = leaves.split_at(leaf_idx * leaf_size);
+                let (leaf, _) = r.split_at(leaf_size);
                 cap_buf.write(H::hash_or_noop(leaf));
             });
         return;
     }
 
     let subtree_digests_len = digests_buf.len() >> cap_height;
-    let subtree_leaves_len = leaves.len() >> cap_height;
+    let subtree_leaves_len = leaves_count >> cap_height;
     let digests_chunks = digests_buf.par_chunks_exact_mut(subtree_digests_len);
-    let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len);
+    let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len * leaf_size);
     assert_eq!(digests_chunks.len(), cap_buf.len());
     assert_eq!(digests_chunks.len(), leaves_chunks.len());
     digests_chunks.zip(cap_buf).zip(leaves_chunks).for_each(
@@ -143,55 +228,245 @@ pub(crate) fn fill_digests_buf<F: RichField, H: Hasher<F>>(
             // We have `1 << cap_height` sub-trees, one for each entry in `cap`. They are totally
             // independent, so we schedule one task for each. `digests_buf` and `leaves` are split
             // into `1 << cap_height` slices, one for each sub-tree.
-            subtree_cap.write(fill_subtree::<F, H>(subtree_digests, subtree_leaves));
+            subtree_cap.write(fill_subtree::<F, H>(
+                subtree_digests,
+                subtree_leaves,
+                leaf_size,
+            ));
         },
     );
+
+    // TODO - debug code - to remove in future
+    /*
+    let digests_count: u64 = digests_buf.len().try_into().unwrap();
+    let leaves_count: u64 = leaves.len().try_into().unwrap();
+    let cap_height: u64  = cap_height.try_into().unwrap();
+    let leaf_size: u64 = leaves[0].len().try_into().unwrap();
+    let fname = format!("cpu-{}-{}-{}-{}.txt", digests_count, leaves_count, leaf_size, cap_height);
+    let mut file = File::create(fname).unwrap();
+    for digest in digests_buf {
+        unsafe {
+            let hash = digest.assume_init().to_vec();
+            for x in hash {
+                let str = format!("{} ", x.to_canonical_u64());
+                file.write_all(str.as_bytes());
+            }
+        }
+        file.write_all(b"\n");
+    }
+    */
+}
+/// `repr(C)` union that views the same 32 bytes as either `[u8; 32]` or `[u64; 4]`.
+#[cfg(feature = "cuda")]
+#[repr(C)]
+union U8U64 {
+    f1: [u8; 32],
+    f2: [u64; 4],
+}
+/// Copies `leaves` to a GPU picked round-robin through `GPU_ID` and builds the digests there.
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    let leaves_count = leaves.len() / leaf_size;
+    // `NUM_OF_GPUS` must be set in the environment; a missing or malformed value panics.
+    let num_gpus: usize = std::env::var("NUM_OF_GPUS")
+        .expect("NUM_OF_GPUS should be set")
+        .parse()
+        .expect("NUM_OF_GPUS should be an unsigned integer");
+    // Take the current device id and advance the shared counter, wrapping at `num_gpus`.
+    let mut gpu_id_lock = GPU_ID.lock().unwrap();
+    let gpu_id = *gpu_id_lock;
+    *gpu_id_lock += 1;
+    if *gpu_id_lock >= num_gpus as u64 {
+        *gpu_id_lock = 0;
+    }
+    let now = Instant::now();
+    let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap();
+    print_time(now, "alloc gpu leaves buffer");
+    // Surface upload failures instead of silently hashing whatever is in the device buffer.
+    let now = Instant::now();
+    gpu_leaves_buf
+        .copy_from_host(leaves.as_slice())
+        .expect("copy leaves to gpu");
+    print_time(now, "leaves copy to gpu");
+    let now = Instant::now();
+    fill_digests_buf_gpu_ptr::<F, H>(
+        digests_buf,
+        cap_buf,
+        gpu_leaves_buf.as_mut_ptr(),
+        leaves_count,
+        leaf_size,
+        cap_height,
+        gpu_id,
+    );
+    print_time(now, "fill_digests_buf_gpu_ptr");
 }
 
-pub(crate) fn merkle_tree_prove<F: RichField, H: Hasher<F>>(
-    leaf_index: usize,
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves_ptr: *const F,
     leaves_len: usize,
+    leaf_len: usize,
     cap_height: usize,
-    digests: &[H::Hash],
-) -> Vec<H::Hash> {
-    let num_layers = log2_strict(leaves_len) - cap_height;
-    debug_assert_eq!(leaf_index >> (cap_height + num_layers), 0);
-
-    let digest_len = 2 * (leaves_len - (1 << cap_height));
-    assert_eq!(digest_len, digests.len());
-
-    let digest_tree: &[H::Hash] = {
-        let tree_index = leaf_index >> num_layers;
-        let tree_len = digest_len >> cap_height;
-        &digests[tree_len * tree_index..tree_len * (tree_index + 1)]
+    gpu_id: u64,
+) {
+    let digests_count: u64 = digests_buf.len().try_into().unwrap();
+    let leaves_count: u64 = leaves_len.try_into().unwrap();
+    let caps_count: u64 = cap_buf.len().try_into().unwrap();
+    let cap_height: u64 = cap_height.try_into().unwrap();
+    let leaf_size: u64 = leaf_len.try_into().unwrap();
+    // Overview: allocate device buffers for digests and cap, hash on the GPU, copy results back.
+    let now = Instant::now();
+    // If a host buffer is empty, still allocate NUM_HASH_OUT_ELTS elements to avoid errors.
+    let digests_size = if digests_buf.len() == 0 {
+        NUM_HASH_OUT_ELTS
+    } else {
+        digests_buf.len() * NUM_HASH_OUT_ELTS
+    };
+    let caps_size = if cap_buf.len() == 0 {
+        NUM_HASH_OUT_ELTS
+    } else {
+        cap_buf.len() * NUM_HASH_OUT_ELTS
     };
 
-    // Mask out high bits to get the index within the sub-tree.
-    let mut pair_index = leaf_index & ((1 << num_layers) - 1);
-    (0..num_layers)
-        .map(|i| {
-            let parity = pair_index & 1;
-            pair_index >>= 1;
-
-            // The layers' data is interleaved as follows:
-            // [layer 0, layer 1, layer 0, layer 2, layer 0, layer 1, layer 0, layer 3, ...].
-            // Each of the above is a pair of siblings.
-            // `pair_index` is the index of the pair within layer `i`.
-            // The index of that the pair within `digests` is
-            // `pair_index * 2 ** (i + 1) + (2 ** i - 1)`.
-            let siblings_index = (pair_index << (i + 1)) + (1 << i) - 1;
-            // We have an index for the _pair_, but we want the index of the _sibling_.
-            // Double the pair index to get the index of the left sibling. Conditionally add `1`
-            // if we are to retrieve the right sibling.
-            let sibling_index = 2 * siblings_index + (1 - parity);
-            digest_tree[sibling_index]
-        })
-        .collect()
+    let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap();
+    let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap();
+
+    unsafe {
+        let num_gpus: usize = std::env::var("NUM_OF_GPUS")
+            .expect("NUM_OF_GPUS should be set")
+            .parse()
+            .unwrap();
+        if !FORCE_SINGLE_GPU
+            && leaves_count >= (1 << 12)
+            && cap_height > 0
+            && num_gpus > 1
+            && H::HASHER_TYPE == HasherType::PoseidonBN128
+        {
+            // Multi-GPU path (not taken while `FORCE_SINGLE_GPU` is true).
+            fill_digests_buf_linear_multigpu_with_gpu_ptr(
+                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                leaves_ptr as *mut core::ffi::c_void,
+                digests_count,
+                caps_count,
+                leaves_count,
+                leaf_size,
+                cap_height,
+                H::HASHER_TYPE as u64,
+            );
+        } else {
+            // Single-GPU path on device `gpu_id`.
+            fill_digests_buf_linear_gpu_with_gpu_ptr(
+                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                leaves_ptr as *mut core::ffi::c_void,
+                digests_count,
+                caps_count,
+                leaves_count,
+                leaf_size,
+                cap_height,
+                H::HASHER_TYPE as u64,
+                gpu_id,
+            );
+        }
+    }
+    print_time(now, "fill init");
+    // Host staging buffers for the device-to-host copies below.
+    let mut host_digests: Vec<F> = vec![F::ZERO; digests_size];
+    let mut host_caps: Vec<F> = vec![F::ZERO; caps_size];
+    let stream1 = CudaStream::create().unwrap();
+    let stream2 = CudaStream::create().unwrap();
+    // Copy digests and caps back on separate streams, then wait for both before reading.
+    gpu_digests_buf
+        .copy_to_host_async(host_digests.as_mut_slice(), &stream1)
+        .expect("copy digests");
+    gpu_cap_buf
+        .copy_to_host_async(host_caps.as_mut_slice(), &stream2)
+        .expect("copy caps");
+    stream1.synchronize().expect("cuda sync");
+    stream2.synchronize().expect("cuda sync");
+    stream1.destroy().expect("cuda stream destroy");
+    stream2.destroy().expect("cuda stream destroy");
+
+    let now = Instant::now();
+    // Repack each group of 4 elements as bytes; the first `H::HASH_SIZE` bytes become the hash.
+    if digests_buf.len() > 0 {
+        host_digests
+            .chunks_exact(4)
+            .zip(digests_buf)
+            .for_each(|(x, y)| {
+                unsafe {
+                    let mut parts = U8U64 { f1: [0; 32] };
+                    parts.f2[0] = x[0].to_canonical_u64();
+                    parts.f2[1] = x[1].to_canonical_u64();
+                    parts.f2[2] = x[2].to_canonical_u64();
+                    parts.f2[3] = x[3].to_canonical_u64();
+                    let (slice, _) = parts.f1.split_at(H::HASH_SIZE);
+                    let h: H::Hash = H::Hash::from_bytes(slice);
+                    y.write(h);
+                };
+            });
+    }
+    // Same conversion for the cap entries.
+    if cap_buf.len() > 0 {
+        host_caps.chunks_exact(4).zip(cap_buf).for_each(|(x, y)| {
+            unsafe {
+                let mut parts = U8U64 { f1: [0; 32] };
+                parts.f2[0] = x[0].to_canonical_u64();
+                parts.f2[1] = x[1].to_canonical_u64();
+                parts.f2[2] = x[2].to_canonical_u64();
+                parts.f2[3] = x[3].to_canonical_u64();
+                let (slice, _) = parts.f1.split_at(H::HASH_SIZE);
+                let h: H::Hash = H::Hash::from_bytes(slice);
+                y.write(h);
+            };
+        });
+    }
+    print_time(now, "copy results");
+}
+/// Picks the CPU or the GPU digest builder based on `leaf_size` (`cuda` builds).
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_meta<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    // Small leaves are hashed on the CPU. NOTE(review): no Keccak check here despite old comment.
+    if leaf_size <= H::HASH_SIZE / 8 {
+        fill_digests_buf::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
+    } else {
+        fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
+    }
+}
+/// Without the `cuda` feature this simply forwards to `fill_digests_buf`.
+#[cfg(not(feature = "cuda"))]
+fn fill_digests_buf_meta<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    fill_digests_buf::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
 }
 
 impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
-    pub fn new(leaves: Vec<Vec<F>>, cap_height: usize) -> Self {
-        let log2_leaves_len = log2_strict(leaves.len());
+    pub fn new_from_1d(leaves_1d: Vec<F>, leaf_size: usize, cap_height: usize) -> Self {
+        let leaves_len = leaves_1d.len() / leaf_size;
+        let log2_leaves_len = log2_strict(leaves_len);
         assert!(
             cap_height <= log2_leaves_len,
             "cap_height={} should be at most log2(leaves.len())={}",
@@ -199,7 +474,7 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             log2_leaves_len
         );
 
-        let num_digests = 2 * (leaves.len() - (1 << cap_height));
+        let num_digests = 2 * (leaves_len - (1 << cap_height));
         let mut digests = Vec::with_capacity(num_digests);
 
         let len_cap = 1 << cap_height;
@@ -207,7 +482,9 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
-        fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
+        let now = Instant::now();
+        fill_digests_buf_meta::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
+        print_time(now, "fill digests buffer");
 
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
@@ -215,38 +492,363 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             digests.set_len(num_digests);
             cap.set_len(len_cap);
         }
+        /*
+        println!{"Digest Buffer"};
+        for dg in &digests {
+            println!("{:?}", dg);
+        }
+        println!{"Cap Buffer"};
+        for dg in &cap {
+            println!("{:?}", dg);
+        }
+        */
+        Self {
+            leaves: leaves_1d,
+            leaf_size,
+            digests,
+            cap: MerkleCap(cap),
+        }
+    }
+
+    /// Flattens `leaves_2d` into one buffer (empty leaves become zeros) and builds the tree.
+    pub fn new_from_2d(leaves_2d: Vec<Vec<F>>, cap_height: usize) -> Self {
+        let leaf_size = leaves_2d[0].len();
+        let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves_2d.len() * leaf_size);
+        for leaf in &leaves_2d {
+            if leaf.is_empty() {
+                leaves_1d.resize(leaves_1d.len() + leaf_size, F::from_canonical_u64(0));
+            } else {
+                assert_eq!(leaf.len(), leaf_size, "all non-empty leaves must share one length");
+                leaves_1d.extend_from_slice(leaf);
+            }
+        }
+        Self::new_from_1d(leaves_1d, leaf_size, cap_height)
+    }
 
+    pub fn new_from_fields(
+        leaves_1d: Vec<F>,
+        leaf_size: usize,
+        digests: Vec<H::Hash>,
+        cap: MerkleCap<F, H>,
+    ) -> Self {
         Self {
-            leaves,
+            leaves: leaves_1d,
+            leaf_size,
+            digests,
+            cap,
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub fn new_from_gpu_leaves(
+        leaves_gpu_ptr: &HostOrDeviceSlice<'_, F>,
+        leaves_len: usize,
+        leaf_len: usize,
+        cap_height: usize,
+    ) -> Self {
+        let log2_leaves_len = log2_strict(leaves_len);
+        assert!(
+            cap_height <= log2_leaves_len,
+            "cap_height={} should be at most log2(leaves.len())={}",
+            cap_height,
+            log2_leaves_len
+        );
+
+        // Enqueue a device-to-host copy of the leaves; presumably done once `synchronize` returns.
+        let mut host_leaves: Vec<F> = vec![F::ZERO; leaves_len * leaf_len];
+        let stream_copy = CudaStream::create().unwrap();
+
+        let start = std::time::Instant::now();
+        leaves_gpu_ptr
+            .copy_to_host_async(host_leaves.as_mut_slice(), &stream_copy)
+            .expect("copy to host error");
+        print_time(start, "copy leaves from GPU async");
+
+        let num_digests = 2 * (leaves_len - (1 << cap_height));
+        let mut digests = Vec::with_capacity(num_digests);
+
+        let len_cap = 1 << cap_height;
+        let mut cap = Vec::with_capacity(len_cap);
+
+        let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
+        let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
+        let now = Instant::now();
+        let gpu_id = 0;
+        fill_digests_buf_gpu_ptr::<F, H>(
+            digests_buf,
+            cap_buf,
+            leaves_gpu_ptr.as_ptr(),
+            leaves_len,
+            leaf_len,
+            cap_height,
+            gpu_id,
+        );
+        print_time(now, "fill digests buffer");
+
+        unsafe {
+            // SAFETY: relies on `fill_digests_buf_gpu_ptr` having initialized `digests_buf` and
+            // `cap_buf` up to `num_digests` and `len_cap`, resp. (not checked here).
+            digests.set_len(num_digests);
+            cap.set_len(len_cap);
+        }
+        /*
+        println!{"Digest Buffer"};
+        for dg in &digests {
+            println!("{:?}", dg);
+        }
+        println!{"Cap Buffer"};
+        for dg in &cap {
+            println!("{:?}", dg);
+        }
+        */
+        let _ = stream_copy.synchronize(); // NOTE(review): result ignored; confirm the copy succeeded
+        let _ = stream_copy.destroy();
+
+        Self {
+            leaves: host_leaves,
+            leaf_size: leaf_len,
             digests,
             cap: MerkleCap(cap),
         }
     }
 
     pub fn get(&self, i: usize) -> &[F] {
-        &self.leaves[i]
+        // Leaf `i` occupies `leaf_size` consecutive elements of the flat leaf buffer.
+        let start = i * self.leaf_size;
+        &self.leaves[start..start + self.leaf_size]
+    }
+
+    pub fn get_leaves_1d(&self) -> Vec<F> {
+        self.leaves.clone()
+    }
+
+    /// Returns an owned copy of the leaves, split into one vector per leaf.
+    /// Each inner vector holds `leaf_size` consecutive elements of the flat leaf buffer.
+    pub fn get_leaves_2d(&self) -> Vec<Vec<F>> {
+        self.leaves
+            .chunks_exact(self.leaf_size)
+            .map(<[F]>::to_vec)
+            .collect()
+    }
+
+    pub fn get_leaves_count(&self) -> usize {
+        self.leaves.len() / self.leaf_size
+    }
+
+    /// Replaces the leaf at `leaf_index` with `leaf` and recomputes the digests on its path:
+    /// the leaf digest, its ancestors inside the cap subtree, and the matching cap entry.
+    /// Panics if `leaf` has the wrong length or `leaf_index` is out of range.
+    pub fn change_leaf_and_update(&mut self, leaf: Vec<F>, leaf_index: usize) {
+        assert_eq!(leaf.len(), self.leaf_size);
+        let leaves_count = self.leaves.len() / self.leaf_size;
+        assert!(leaf_index < leaves_count);
+
+        let cap_height = log2_strict(self.cap.len());
+        // Overwrite the leaf in place; cloning the whole leaf buffer per update is O(n).
+        let start = leaf_index * self.leaf_size;
+        self.leaves[start..start + self.leaf_size].copy_from_slice(&leaf);
+        let leaf_digest = H::hash_or_noop(&leaf);
+
+        let digests_len = self.digests.len();
+        let cap_len = self.cap.0.len();
+        let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len);
+        let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len);
+        if digests_buf.is_empty() {
+            // No inner digests: the cap entry at `leaf_index` is the leaf digest itself.
+            cap_buf[leaf_index].write(leaf_digest);
+            return;
+        }
+
+        // Digests are grouped per cap subtree; inside a subtree the leaf digests come last
+        // and the parent of node `c` is node `c / 2 - 1`.
+        let subtree_leaves_len = leaves_count >> cap_height;
+        let subtree_idx = leaf_index / subtree_leaves_len;
+        let subtree_digests_len = digests_buf.len() >> cap_height;
+        let subtree_offset = subtree_idx * subtree_digests_len;
+        let idx_in_subtree =
+            subtree_digests_len - subtree_leaves_len + leaf_index % subtree_leaves_len;
+        digests_buf[subtree_offset + idx_in_subtree].write(leaf_digest);
+
+        // Walk up to the subtree's top pair (nodes 0 and 1), rehashing each (even, odd) pair.
+        let mut child_idx = idx_in_subtree;
+        while child_idx > 1 {
+            let parent_idx = child_idx / 2 - 1;
+            let left_idx = subtree_offset + (child_idx & !1);
+            // SAFETY: the buffer spans the vector's current length, i.e. existing digests.
+            let (left_digest, right_digest) = unsafe {
+                (
+                    digests_buf[left_idx].assume_init(),
+                    digests_buf[left_idx + 1].assume_init(),
+                )
+            };
+            digests_buf[subtree_offset + parent_idx]
+                .write(H::two_to_one(left_digest, right_digest));
+            child_idx = parent_idx;
+        }
+
+        // SAFETY: as above; slots 0 and 1 of the subtree hold its two top digests.
+        let (left_digest, right_digest) = unsafe {
+            (
+                digests_buf[subtree_offset].assume_init(),
+                digests_buf[subtree_offset + 1].assume_init(),
+            )
+        };
+        cap_buf[subtree_idx].write(H::two_to_one(left_digest, right_digest));
+    }
+
+    /// Replaces leaves `start_index..end_index` (end exclusive) with `new_leaves` and recomputes
+    /// the affected digests and cap entries.
+    ///
+    /// Panics if the range is empty or out of bounds, or if any leaf has the wrong length.
+    pub fn change_leaves_in_range_and_update(
+        &mut self,
+        new_leaves: Vec<Vec<F>>,
+        start_index: usize,
+        end_index: usize,
+    ) {
+        let leaf_size = self.leaf_size;
+        let tree_leaves_count = self.leaves.len() / leaf_size;
+        assert!(start_index < end_index);
+        // `end_index` is exclusive, so the range may extend to the last leaf.
+        assert!(end_index <= tree_leaves_count);
+        assert_eq!(new_leaves.len(), end_index - start_index);
+        assert!(new_leaves.iter().all(|leaf| leaf.len() == leaf_size));
+
+        let cap_height = log2_strict(self.cap.len());
+
+        // Overwrite the affected leaves in place; no copy of the whole leaf buffer is needed.
+        self.leaves[start_index * leaf_size..end_index * leaf_size]
+            .chunks_exact_mut(leaf_size)
+            .zip(&new_leaves)
+            .for_each(|(dst, src)| dst.copy_from_slice(src));
+
+        let digests_len = self.digests.len();
+        let cap_len = self.cap.0.len();
+        let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len);
+        let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len);
+        if digests_buf.is_empty() {
+            cap_buf[start_index..end_index]
+                .par_iter_mut()
+                .zip(new_leaves)
+                .for_each(|(cap, leaf)| {
+                    cap.write(H::hash_or_noop(leaf.as_slice()));
+                });
+        } else {
+            let subtree_leaves_len = tree_leaves_count >> cap_height;
+            let subtree_digests_len = digests_buf.len() >> cap_height;
+
+            let mut positions: Vec<usize> = (start_index..end_index)
+                .map(|idx| {
+                    let subtree_idx = idx / subtree_leaves_len;
+                    let subtree_offset = subtree_idx * subtree_digests_len;
+                    let idx_in_subtree =
+                        subtree_digests_len - subtree_leaves_len + idx % subtree_leaves_len;
+                    subtree_offset + idx_in_subtree
+                })
+                .collect();
+
+            // TODO change to parallel loop
+            for i in 0..positions.len() {
+                digests_buf[positions[i]].write(H::hash_or_noop(new_leaves[i].as_slice()));
+            }
+
+            if subtree_digests_len > 2 {
+                let rounds = log2_strict(tree_leaves_count) - cap_height - 1;
+                for _ in 0..rounds {
+                    let mut parent_indexes: HashSet<usize> = HashSet::new();
+                    let parents: Vec<usize> = positions
+                        .par_iter()
+                        .map(|pos| {
+                            let subtree_offset = pos / subtree_digests_len;
+                            let idx_in_subtree = pos % subtree_digests_len;
+                            let mut parent_idx = 0;
+                            if idx_in_subtree > 1 {
+                                parent_idx = idx_in_subtree / 2 - 1;
+                            }
+                            subtree_offset * subtree_digests_len + parent_idx
+                        })
+                        .collect();
+                    for p in parents {
+                        parent_indexes.insert(p);
+                    }
+                    positions = parent_indexes.into_iter().collect();
+
+                    // TODO change to parallel loop
+                    for i in 0..positions.len() {
+                        let subtree_offset = positions[i] / subtree_digests_len;
+                        let idx_in_subtree = positions[i] % subtree_digests_len;
+                        let digest_idx =
+                            subtree_offset * subtree_digests_len + 2 * (idx_in_subtree + 1);
+                        unsafe {
+                            let left_digest = digests_buf[digest_idx].assume_init();
+                            let right_digest = digests_buf[digest_idx + 1].assume_init();
+                            digests_buf[positions[i]]
+                                .write(H::two_to_one(left_digest, right_digest));
+                        }
+                    }
+                }
+            }
+
+            let mut cap_indexes: HashSet<usize> = HashSet::new();
+            for idx in start_index..end_index {
+                cap_indexes.insert(idx / subtree_leaves_len);
+            }
+
+            unsafe {
+                for idx in cap_indexes {
+                    let digest_idx = idx * subtree_digests_len;
+                    let left_digest = digests_buf[digest_idx].assume_init();
+                    let right_digest = digests_buf[digest_idx + 1].assume_init();
+                    cap_buf[idx].write(H::two_to_one(left_digest, right_digest));
+                }
+            }
+        }
     }
 
     /// Create a Merkle proof from a leaf index.
     pub fn prove(&self, leaf_index: usize) -> MerkleProof<F, H> {
         let cap_height = log2_strict(self.cap.len());
-        let siblings =
-            merkle_tree_prove::<F, H>(leaf_index, self.leaves.len(), cap_height, &self.digests);
+        let num_layers = log2_strict(self.get_leaves_count()) - cap_height;
+        let subtree_digest_size = (1 << (num_layers + 1)) - 2; // 2^(k+1) - 2, k = num_layers
+        let subtree_idx = leaf_index / (1 << num_layers);
+
+        let siblings: Vec<<H as Hasher<F>>::Hash> = Vec::with_capacity(num_layers);
+        if num_layers == 0 {
+            return MerkleProof { siblings };
+        }
+
+        // index of the leaf's own digest inside its subtree (leaf digests occupy the last slots)
+        let idx = subtree_digest_size - (1 << num_layers) + (leaf_index % (1 << num_layers));
+
+        let siblings = (0..num_layers)
+            .map(|i| {
+                // subtree-relative index of the path node `i` levels up; `1 << i + 1` is 2^(i+1)
+                let rel_idx = (idx + 2 - (1 << i + 1)) / (1 << i);
+                // index into `self.digests`, then moved to the other node of the (even, odd) pair
+                let mut abs_idx = subtree_idx * subtree_digest_size + rel_idx;
+                if (rel_idx & 1) == 1 {
+                    abs_idx -= 1;
+                } else {
+                    abs_idx += 1;
+                }
+                self.digests[abs_idx]
+            })
+            .collect();
 
         MerkleProof { siblings }
     }
 }
 
 #[cfg(test)]
-pub(crate) mod tests {
+mod tests {
     use anyhow::Result;
 
     use super::*;
     use crate::field::extension::Extendable;
     use crate::hash::merkle_proofs::verify_merkle_proof_to_cap;
-    use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
+    use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig};
 
-    pub(crate) fn random_data<F: RichField>(n: usize, k: usize) -> Vec<Vec<F>> {
+    fn random_data<F: RichField>(n: usize, k: usize) -> Vec<Vec<F>> {
         (0..n).map(|_| F::rand_vec(k)).collect()
     }
 
@@ -258,7 +860,7 @@ pub(crate) mod tests {
         leaves: Vec<Vec<F>>,
         cap_height: usize,
     ) -> Result<()> {
-        let tree = MerkleTree::<F, C::Hasher>::new(leaves.clone(), cap_height);
+        let tree = MerkleTree::<F, C::Hasher>::new_from_2d(leaves.clone(), cap_height);
         for (i, leaf) in leaves.into_iter().enumerate() {
             let proof = tree.prove(i);
             verify_merkle_proof_to_cap(leaf, i, &tree.cap, &proof)?;
@@ -266,6 +868,224 @@ pub(crate) mod tests {
         Ok(())
     }
 
+    fn verify_change_leaf_and_update(log_n: usize, cap_h: usize) {
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let n = 1 << log_n;
+        let k = 7;
+        let mut leaves = random_data::<F>(n, k);
+
+        let mut mt1 =
+            MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves.clone(), cap_h);
+
+        let tmp = random_data::<F>(1, k);
+        leaves[0] = tmp[0].clone();
+        let mt2 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves, cap_h);
+
+        mt1.change_leaf_and_update(tmp[0].clone(), 0);
+
+        /*
+        println!("Tree 1");
+        mt1.digests.into_iter().for_each(
+            |x| {
+                println!("{:?}", x);
+            }
+        );
+        println!("Tree 2");
+        mt2.digests.into_iter().for_each(
+            |x| {
+                println!("{:?}", x);
+            }
+        );
+        */
+
+        mt1.digests
+            .into_par_iter()
+            .zip(mt2.digests)
+            .for_each(|(d1, d2)| {
+                assert_eq!(d1, d2);
+            });
+
+        mt1.cap
+            .0
+            .into_par_iter()
+            .zip(mt2.cap.0)
+            .for_each(|(d1, d2)| {
+                assert_eq!(d1, d2);
+            });
+    }
+
+    fn verify_change_leaf_and_update_range_one_by_one(
+        leaves_count: usize,
+        leaf_size: usize,
+        cap_height: usize,
+        start_index: usize,
+        end_index: usize,
+    ) {
+        use plonky2_field::types::Field;
+
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let raw_leaves: Vec<Vec<F>> = random_data::<F>(leaves_count, leaf_size);
+        let vals: Vec<Vec<F>> = random_data::<F>(end_index - start_index, leaf_size);
+
+        let mut leaves1_1d: Vec<F> = raw_leaves.into_iter().flatten().collect();
+        let leaves2_1d: Vec<F> = leaves1_1d.clone();
+
+        let mut tree2 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves2_1d, leaf_size, cap_height,
+        );
+
+        // v1
+        let now = Instant::now();
+        for i in start_index..end_index {
+            for j in 0..leaf_size {
+                leaves1_1d[i * leaf_size + j] = vals[i - start_index][j];
+            }
+        }
+        let tree1 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves1_1d, leaf_size, cap_height,
+        );
+        println!("Time V1: {} ms", now.elapsed().as_millis());
+
+        // v2
+        let now = Instant::now();
+        for idx in start_index..end_index {
+            let mut leaf: Vec<F> = vec![F::from_canonical_u64(0); leaf_size];
+            for j in 0..leaf_size {
+                leaf[j] = vals[idx - start_index][j];
+            }
+            tree2.change_leaf_and_update(leaf, idx);
+        }
+        println!("Time V2: {} ms", now.elapsed().as_millis());
+
+        // compare leaves
+        let t2leaves = tree2.get_leaves_1d();
+        tree1
+            .get_leaves_1d()
+            .chunks_exact(leaf_size)
+            .enumerate()
+            .for_each(|(i, x)| {
+                let mut ok = true;
+                for j in 0..leaf_size {
+                    if x[j] != t2leaves[i * leaf_size + j] {
+                        ok = false;
+                        break;
+                    }
+                }
+                if !ok {
+                    println!("Leaves different at index {:?}", i);
+                }
+                assert!(ok);
+            });
+
+        // compare trees
+        tree1.digests.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.digests[i];
+            if x != y {
+                println!("Digests different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+        tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.cap.0[i];
+            if x != y {
+                println!("Cap different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+    }
+
+    fn verify_change_leaf_and_update_range(
+        leaves_count: usize,
+        leaf_size: usize,
+        cap_height: usize,
+        start_index: usize,
+        end_index: usize,
+    ) {
+        // use plonky2_field::types::Field;
+
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let raw_leaves: Vec<Vec<F>> = random_data::<F>(leaves_count, leaf_size);
+        let vals: Vec<Vec<F>> = random_data::<F>(end_index - start_index, leaf_size);
+
+        let mut leaves1_1d: Vec<F> = raw_leaves.into_iter().flatten().collect();
+        let leaves2_1d: Vec<F> = leaves1_1d.clone();
+
+        let mut tree2 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves2_1d, leaf_size, cap_height,
+        );
+
+        // v1
+        let now = Instant::now();
+        for i in start_index..end_index {
+            for j in 0..leaf_size {
+                leaves1_1d[i * leaf_size + j] = vals[i - start_index][j];
+            }
+        }
+        let tree1 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves1_1d, leaf_size, cap_height,
+        );
+        println!("Time V1: {} ms", now.elapsed().as_millis());
+
+        // v2
+        let now = Instant::now();
+        /*
+        for idx in start_index..end_index {
+            let mut leaf: Vec<F> = vec![F::from_canonical_u64(0); leaf_size];
+            for j in 0..leaf_size {
+                leaf[j] = vals[idx - start_index][j];
+            }
+            tree2.change_leaf_and_update(leaf, idx);
+        }
+        */
+        tree2.change_leaves_in_range_and_update(vals, start_index, end_index);
+        println!("Time V2: {} ms", now.elapsed().as_millis());
+
+        // compare leaves
+        let t2leaves = tree2.get_leaves_1d();
+        tree1
+            .get_leaves_1d()
+            .chunks_exact(leaf_size)
+            .enumerate()
+            .for_each(|(i, x)| {
+                let mut ok = true;
+                for j in 0..leaf_size {
+                    if x[j] != t2leaves[i * leaf_size + j] {
+                        ok = false;
+                        break;
+                    }
+                }
+                if !ok {
+                    println!("Leaves different at index {:?}", i);
+                }
+                assert!(ok);
+            });
+
+        // compare trees
+        tree1.digests.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.digests[i];
+            if x != y {
+                println!("Digests different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+        tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.cap.0[i];
+            if x != y {
+                println!("Cap different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+    }
+
     #[test]
     #[should_panic]
     fn test_cap_height_too_big() {
@@ -277,7 +1097,7 @@ pub(crate) mod tests {
         let cap_height = log_n + 1; // Should panic if `cap_height > len_n`.
 
         let leaves = random_data::<F>(1 << log_n, 7);
-        let _ = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new(leaves, cap_height);
+        let _ = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves, cap_height);
     }
 
     #[test]
@@ -296,12 +1116,89 @@ pub(crate) mod tests {
     }
 
     #[test]
-    fn test_merkle_trees() -> Result<()> {
+    fn test_change_leaf_and_update() -> Result<()> {
+        // small tree, 1 cap
+        verify_change_leaf_and_update(3, 0);
+        // small tree, 2 cap
+        verify_change_leaf_and_update(3, 1);
+        // small tree, 4 cap
+        verify_change_leaf_and_update(3, 2);
+        // small tree, all cap
+        verify_change_leaf_and_update(3, 3);
+
+        // big tree
+        verify_change_leaf_and_update(12, 3);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_change_leaf_and_update_range() -> Result<()> {
+        for h in 0..11 {
+            println!(
+                "Run verify_change_leaf_and_update_range_one_by_one() for height {:?}",
+                h
+            );
+            verify_change_leaf_and_update_range_one_by_one(1024, 68, h, 32, 48);
+            println!(
+                "Run verify_change_leaf_and_update_range() for height {:?}",
+                h
+            );
+            verify_change_leaf_and_update_range(1024, 68, h, 32, 48);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_merkle_trees_poseidon_g64() -> Result<()> {
         const D: usize = 2;
         type C = PoseidonGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
 
-        let log_n = 8;
+        // GPU warmup
+        #[cfg(feature = "cuda")]
+        let _x: HostOrDeviceSlice<'_, F> = HostOrDeviceSlice::cuda_malloc(0, 64).unwrap();
+
+        let log_n = 12;
+        let n = 1 << log_n;
+        let leaves = random_data::<F>(n, 7);
+
+        verify_all_leaves::<F, C, D>(leaves, 1)?;
+
+        Ok(())
+    }
+
+    #[cfg(feature = "cuda")]
+    #[test]
+    fn test_merkle_trees_cuda_poseidon_g64() -> Result<()> {
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let log_n = 14;
+        let n = 1 << log_n;
+        let leaves = random_data::<F>(n, 7);
+        let leaves_1d: Vec<F> = leaves.into_iter().flatten().collect();
+
+        let mut gpu_data: HostOrDeviceSlice<'_, F> =
+            HostOrDeviceSlice::cuda_malloc(0, n * 7).unwrap();
+        gpu_data
+            .copy_from_host(leaves_1d.as_slice())
+            .expect("copy data to gpu");
+
+        MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_gpu_leaves(&gpu_data, n, 7, 1);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_merkle_trees_keccak() -> Result<()> {
+        const D: usize = 2;
+        type C = KeccakGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let log_n = 12;
         let n = 1 << log_n;
         let leaves = random_data::<F>(n, 7);
 
diff --git a/plonky2/src/hash/mod.rs b/plonky2/src/hash/mod.rs
index 0e4bb8a59..3d2e3b161 100644
--- a/plonky2/src/hash/mod.rs
+++ b/plonky2/src/hash/mod.rs
@@ -2,7 +2,7 @@
 //! as well as specific hash functions implementation.
 
 mod arch;
-pub mod batch_merkle_tree;
+// pub mod batch_merkle_tree;
 pub mod hash_types;
 pub mod hashing;
 pub mod keccak;
diff --git a/plonky2/src/hash/path_compression.rs b/plonky2/src/hash/path_compression.rs
index 517576bf0..5a7f7e1ca 100644
--- a/plonky2/src/hash/path_compression.rs
+++ b/plonky2/src/hash/path_compression.rs
@@ -129,8 +129,14 @@ mod tests {
         type F = <C as GenericConfig<D>>::F;
         let h = 10;
         let cap_height = 3;
-        let vs = (0..1 << h).map(|_| vec![F::rand()]).collect::<Vec<_>>();
-        let mt = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new(vs.clone(), cap_height);
+        let vs = (0..1 << h)
+            .flat_map(|_| vec![F::rand()])
+            .collect::<Vec<_>>();
+        let mt = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            vs.clone(),
+            1,
+            cap_height,
+        );
 
         let mut rng = OsRng;
         let k = rng.gen_range(1..=1 << h);
@@ -139,7 +145,10 @@ mod tests {
 
         let compressed_proofs = compress_merkle_proofs(cap_height, &indices, &proofs);
         let decompressed_proofs = decompress_merkle_proofs(
-            &indices.iter().map(|&i| vs[i].clone()).collect::<Vec<_>>(),
+            &indices
+                .iter()
+                .map(|&i| vec![vs[i].clone()])
+                .collect::<Vec<_>>(),
             &indices,
             &compressed_proofs,
             h,
diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs
index a7c763252..2b6dd0ac2 100644
--- a/plonky2/src/hash/poseidon.rs
+++ b/plonky2/src/hash/poseidon.rs
@@ -18,7 +18,7 @@ use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation};
 use crate::iop::ext_target::ExtensionTarget;
 use crate::iop::target::{BoolTarget, Target};
 use crate::plonk::circuit_builder::CircuitBuilder;
-use crate::plonk::config::{AlgebraicHasher, Hasher};
+use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType};
 
 pub const SPONGE_RATE: usize = 8;
 pub const SPONGE_CAPACITY: usize = 4;
@@ -874,6 +874,7 @@ impl<T: Copy + Debug + Default + Eq + Permuter + Send + Sync> PlonkyPermutation<
 pub struct PoseidonHash;
 impl<F: RichField> Hasher<F> for PoseidonHash {
     const HASH_SIZE: usize = 4 * 8;
+    const HASHER_TYPE: HasherType = HasherType::Poseidon;
     type Hash = HashOut<F>;
     type Permutation = PoseidonPermutation<F>;
 
diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs
index 8772ecfc0..4bf8cc982 100644
--- a/plonky2/src/lib.rs
+++ b/plonky2/src/lib.rs
@@ -11,7 +11,7 @@ pub extern crate alloc;
 #[doc(inline)]
 pub use plonky2_field as field;
 
-pub mod batch_fri;
+// pub mod batch_fri;
 pub mod fri;
 pub mod gadgets;
 pub mod gates;
diff --git a/plonky2/src/plonk/config.rs b/plonky2/src/plonk/config.rs
index 217c88976..f4fe480cb 100644
--- a/plonky2/src/plonk/config.rs
+++ b/plonky2/src/plonk/config.rs
@@ -23,6 +23,14 @@ use crate::hash::poseidon::PoseidonHash;
 use crate::iop::target::{BoolTarget, Target};
 use crate::plonk::circuit_builder::CircuitBuilder;
 
+#[derive(PartialEq, Debug, Copy, Clone)]
+pub enum HasherType {
+    Poseidon = 0,
+    Keccak = 1,
+    PoseidonBN128 = 2,
+    Poseidon2 = 3,
+}
+
 pub trait GenericHashOut<F: RichField>:
     Copy + Clone + Debug + Eq + PartialEq + Send + Sync + Serialize + DeserializeOwned
 {
@@ -34,6 +42,8 @@ pub trait GenericHashOut<F: RichField>:
 
 /// Trait for hash functions.
 pub trait Hasher<F: RichField>: Sized + Copy + Debug + Eq + PartialEq {
+    const HASHER_TYPE: HasherType;
+
     /// Size of `Hash` in bytes.
     const HASH_SIZE: usize;
 
diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs
index 3755851ac..fbfd7974f 100644
--- a/plonky2/src/util/serialization/mod.rs
+++ b/plonky2/src/util/serialization/mod.rs
@@ -321,21 +321,22 @@ pub trait Read {
         H: Hasher<F>,
     {
         let leaves_len = self.read_usize()?;
-        let mut leaves = Vec::with_capacity(leaves_len);
+        let leaf_len = self.read_usize()?;
+        let mut leaves_2d = Vec::with_capacity(leaves_len * leaf_len);
         for _ in 0..leaves_len {
-            let leaf_len = self.read_usize()?;
-            leaves.push(self.read_field_vec(leaf_len)?);
+            // let leaf_len = self.read_usize()?;
+            leaves_2d.push(self.read_field_vec(leaf_len)?);
         }
 
+        let leaves_1d = leaves_2d.into_iter().flatten().collect();
+
         let digests_len = self.read_usize()?;
         let digests = self.read_hash_vec::<F, H>(digests_len)?;
         let cap_height = self.read_usize()?;
         let cap = self.read_merkle_cap::<F, H>(cap_height)?;
-        Ok(MerkleTree {
-            leaves,
-            digests,
-            cap,
-        })
+        Ok(MerkleTree::new_from_fields(
+            leaves_1d, leaf_len, digests, cap,
+        ))
     }
 
     /// Reads a value of type [`OpeningSet`] from `self` with the given `common_data`.
@@ -1421,10 +1422,12 @@ pub trait Write {
         F: RichField,
         H: Hasher<F>,
     {
-        self.write_usize(tree.leaves.len())?;
-        for i in 0..tree.leaves.len() {
-            self.write_usize(tree.leaves[i].len())?;
-            self.write_field_vec(&tree.leaves[i])?;
+        let leaves_count = tree.get_leaves_count();
+        self.write_usize(leaves_count)?;
+        self.write_usize(tree.leaf_size)?;
+        for i in 0..leaves_count {
+            // self.write_usize(tree.leaf_size)?;
+            self.write_field_vec(&tree.get(i))?;
         }
         self.write_hash_vec::<F, H>(&tree.digests)?;
         self.write_usize(tree.cap.height())?;

From ea7334c71bda6cf19a15f0f6ec48987d7ec7b95a Mon Sep 17 00:00:00 2001
From: zhenfeizhang <zhenfei.zhang@hotmail.com>
Date: Sun, 23 Nov 2025 19:22:23 -0500
Subject: [PATCH 05/37] wip

---
 field/Cargo.toml            |   1 +
 field/src/fft.rs            | 190 ++++++++++++++++++++++++++++++------
 field/src/polynomial/mod.rs |  14 +--
 3 files changed, 168 insertions(+), 37 deletions(-)

diff --git a/field/Cargo.toml b/field/Cargo.toml
index 49cb04494..12e38e354 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -11,6 +11,7 @@ keywords.workspace = true
 categories.workspace = true
 
 [dependencies]
+ark-std = "0.5.0"
 anyhow = { workspace = true }
 itertools = { workspace = true, features = ["use_alloc"] }
 num = { workspace = true, features = ["alloc"] }
diff --git a/field/src/fft.rs b/field/src/fft.rs
index eeb86b62d..9c6c8fed6 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -40,16 +40,41 @@ fn fft_dispatch_gpu<F: Field>(
 ) {
     use zeknox::ntt_batch;
     use zeknox::types::NTTConfig;
-    if F::CUDA_SUPPORT {
-        return ntt_batch(
-            0,
-            input.as_mut_ptr(),
-            input.len().trailing_zeros() as usize,
-            NTTConfig::default(),
-        );
-    } else {
-        return fft_dispatch_cpu(input, zero_factor, root_table);
-    }
+
+    let mut a = input.to_vec();
+    let mut b = input.to_vec();
+
+    ntt_batch(
+        0,
+        a.as_mut_ptr(),
+        input.len().trailing_zeros() as usize,
+        NTTConfig::default(),
+    );
+
+    fft_dispatch_cpu(&mut b, zero_factor, root_table);
+    ark_std::println!("a: {:?}", a);
+    ark_std::println!("b: {:?}", b);
+
+    assert_eq!(
+        a, b,
+        "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}",
+        a, b, input
+    );
+
+    input.copy_from_slice(&a);
+
+    // use zeknox::ntt_batch;
+    // use zeknox::types::NTTConfig;
+    // if F::CUDA_SUPPORT {
+    //     return ntt_batch(
+    //         0,
+    //         input.as_mut_ptr(),
+    //         input.len().trailing_zeros() as usize,
+    //         NTTConfig::default(),
+    //     );
+    // } else {
+    //     return fft_dispatch_cpu(input, zero_factor, root_table);
+    // }
 }
 
 /// Batch FFT computation for multiple polynomials on GPU
@@ -202,25 +227,46 @@ pub fn coset_fft_batch_with_options<F: Field>(
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) -> Vec<PolynomialValues<F>> {
-    #[cfg(feature = "cuda")]
-    return coset_fft_batch_gpu(polys, zero_factor, root_table);
-
-    #[cfg(not(feature = "cuda"))]
-    {
-        // CPU fallback: process each polynomial separately
-        polys
-            .into_iter()
-            .map(|poly| {
-                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-                    .powers()
-                    .zip(&poly.coeffs)
-                    .map(|(r, &c)| r * c)
-                    .collect::<Vec<_>>()
-                    .into();
-                fft_with_options(modified_poly, zero_factor, root_table)
-            })
-            .collect()
-    }
+    // #[cfg(feature = "cuda")]
+    // {
+    //     let a = coset_fft_batch_gpu(polys.clone(), zero_factor, root_table);
+    //     let b = polys
+    //         .into_iter()
+    //         .map(|poly| {
+    //             let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+    //                 .powers()
+    //                 .zip(&poly.coeffs)
+    //                 .map(|(r, &c)| r * c)
+    //                 .collect::<Vec<_>>()
+    //                 .into();
+    //             fft_with_options(modified_poly, zero_factor, root_table)
+    //         })
+    //         .collect::<Vec<_>>();
+    //     assert_eq!(a.len(), b.len());
+
+    //     for (i, (val_a, val_b)) in a.iter().zip(b.iter()).enumerate() {
+    //         assert_eq!(val_a, val_b, "Mismatch at index {}", i);
+    //     }
+
+    //     return a;
+    // }
+
+    // #[cfg(not(feature = "cuda"))]
+    // {
+    // CPU fallback: process each polynomial separately
+    polys
+        .into_iter()
+        .map(|poly| {
+            let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                .powers()
+                .zip(&poly.coeffs)
+                .map(|(r, &c)| r * c)
+                .collect::<Vec<_>>()
+                .into();
+            fft_with_options(modified_poly, zero_factor, root_table)
+        })
+        .collect()
+    // }
 }
 
 fn fft_dispatch_cpu<F: Field>(
@@ -483,11 +529,76 @@ mod tests {
     #[cfg(feature = "cuda")]
     use zeknox::init_twiddle_factors_rs;
 
     use crate::fft::{coset_fft_batch, fft, fft_batch, fft_with_options, ifft};
     use crate::goldilocks_field::GoldilocksField;
     use crate::polynomial::{PolynomialCoeffs, PolynomialValues};
     use crate::types::Field;
 
+    /// Known-answer test: the CPU and GPU FFTs must both reproduce a fixed output vector.
+    ///
+    /// Needs the `cuda` feature because it initializes twiddle factors and runs the GPU FFT.
+    #[cfg(feature = "cuda")]
+    #[test]
+    fn test_kat() {
+        use crate::fft::{fft_dispatch_cpu, fft_dispatch_gpu};
+
+        // Twiddle factors for log2 size 4, i.e. the 16-element input below.
+        init_twiddle_factors_rs(0, 4);
+
+        let input: Vec<GoldilocksField> = [
+            16807u64,
+            10376289027450995739,
+            18446743787439915009,
+            1905022641934172156,
+            4730749933575995392,
+            68841472,
+            18428264577490855681,
+            18445589101169082369,
+            18446744069414567514,
+            8070455041963588582,
+            49,
+            1625527855624486912,
+            7,
+            18446744069414555649,
+            7696581392640,
+            481036337152,
+        ]
+        .iter()
+        .map(|&x| GoldilocksField::from_canonical_u64(x))
+        .collect();
+
+        // The CPU and GPU paths are expected to agree, so one reference vector serves both.
+        let expected: Vec<GoldilocksField> = [
+            8241673866677297204u64,
+            18443207692673526440,
+            3336172192632445894,
+            12915814655533318448,
+            5977358399840934215,
+            2796120128477098295,
+            16099264885043452953,
+            1114428869533774434,
+            1182881845840683068,
+            18442399148451944616,
+            5639697009785877037,
+            5534977815694745617,
+            3521085621945067109,
+            15650623939293352472,
+            11342098386477995483,
+            17336148097415430195,
+        ]
+        .iter()
+        .map(|&x| GoldilocksField::from_canonical_u64(x))
+        .collect();
+
+        let mut cpu_output = input.clone();
+        fft_dispatch_cpu(&mut cpu_output, None, None);
+        assert_eq!(cpu_output, expected);
+
+        let mut gpu_output = input;
+        fft_dispatch_gpu(&mut gpu_output, None, None);
+        assert_eq!(gpu_output, expected);
+    }
+
     #[test]
     fn fft_and_ifft() {
         type F = GoldilocksField;
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index be8bf0ad9..28f6e8ed3 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -283,13 +283,13 @@ impl<F: Field> PolynomialCoeffs<F> {
         zero_factor: Option<usize>,
         root_table: Option<&FftRootTable<F>>,
     ) -> PolynomialValues<F> {
-        #[cfg(feature = "cuda")]
-        {
-            if F::CUDA_SUPPORT && shift == F::coset_shift() {
-                // Use GPU coset FFT directly without CPU-side coefficient modification
-                return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
-            }
-        }
+        // #[cfg(feature = "cuda")]
+        // {
+        //     if F::CUDA_SUPPORT && shift == F::coset_shift() {
+        //         // Use GPU coset FFT directly without CPU-side coefficient modification
+        //         return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
+        //     }
+        // }
 
         // CPU path: multiply by powers of shift, then do regular FFT
         let modified_poly: Self = shift

From 912f1a2bd7b857dcd313f40c57e55de00540d18b Mon Sep 17 00:00:00 2001
From: zhenfeizhang <zhenfei.zhang@hotmail.com>
Date: Sun, 23 Nov 2025 19:22:52 -0500
Subject: [PATCH 06/37] wip

---
 Cargo.toml                           |   2 +-
 plonky2/Cargo.toml                   |   6 +-
 plonky2/examples/fibonacci.rs        |  76 ++++++++++--
 plonky2/src/batch_fri/oracle.rs      |  12 ++
 plonky2/src/batch_fri/prover.rs      |  21 ++++
 plonky2/src/fri/oracle.rs            |  27 ++++-
 plonky2/src/gadgets/interpolation.rs |   2 +-
 plonky2/src/hash/merkle_tree.rs      | 168 ++++++++++++++++++++++++++-
 plonky2/src/iop/generator.rs         |   6 +-
 plonky2/src/plonk/prover.rs          |  32 ++++-
 plonky2/src/plonk/verifier.rs        |   5 +
 plonky2/src/util/mod.rs              | 108 ++++++++++++++++-
 12 files changed, 443 insertions(+), 22 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index eed6218b8..d8ebd31d6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
-zeknox_= { path = "../zeknox/wrappers/rust"}
+zeknox = { path = "../zeknox/wrappers/rust" }
 
 
 [profile.release]
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 83ff08519..c5c282928 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -12,11 +12,12 @@ keywords.workspace = true
 categories.workspace = true
 
 [features]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
 std = ["anyhow/std", "rand/std", "itertools/use_std"]
 timing = ["std", "dep:web-time"]
+cuda = ["plonky2_field/cuda"]
 
 [dependencies]
 ahash = { workspace = true }
@@ -38,6 +39,9 @@ plonky2_field = { version = "1.0.0", path = "../field", default-features = false
 plonky2_maybe_rayon = { version = "1.0.0", path = "../maybe_rayon", default-features = false }
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
+# cuda accelerator wrapper
+zeknox = { workspace = true }
+
 
 [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies]
 getrandom = { version = "0.2", default-features = false, features = ["js"] }
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index 578dc2424..dfce8de8b 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -9,13 +9,19 @@ use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
 /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b."
 /// When a == 0 and b == 1, this is proving knowledge of the 100th (standard) Fibonacci number.
 fn main() -> Result<()> {
+    // Initialize logger to see timing output
+    env_logger::Builder::from_default_env()
+        .format_timestamp(None)
+        .filter_level(log::LevelFilter::Debug)
+        .init();
     const D: usize = 2;
     type C = PoseidonGoldilocksConfig;
     type F = <C as GenericConfig<D>>::F;
 
     let config = CircuitConfig::standard_recursion_config();
+    println!("Building circuit...");
     let mut builder = CircuitBuilder::<F, D>::new(config);
-
+    println!("Building arithmetic circuit...");
     // The arithmetic circuit.
     let initial_a = builder.add_virtual_target();
     let initial_b = builder.add_virtual_target();
@@ -26,6 +32,28 @@ fn main() -> Result<()> {
         prev_target = cur_target;
         cur_target = temp;
     }
+    println!("Circuit built.");
+
+    #[cfg(feature = "cuda")]
+    {
+        zeknox::clear_cuda_errors_rs();
+        println!("Initializing CUDA twiddle factors...");
+        // Initialize twiddle factors for log2 sizes 3 and 6 only.
+        // NOTE(review): presumably these are the only FFT sizes this example needs — confirm.
+        // The commented-out loop below would instead initialize every log2 size from 0 to 19.
+        // for i in 0..=19 {
+        //     zeknox::init_twiddle_factors_rs(0, i);
+        // }
+
+        zeknox::init_twiddle_factors_rs(0, 3);
+        zeknox::init_twiddle_factors_rs(0, 6);
+        // Initialize coset on GPU
+        // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
+        // TODO: Make this generic for other fields if needed
+        let coset_gen_u64 = 7u64;
+        // zeknox::init_coset_rs(0, 19, coset_gen_u64);
+        zeknox::init_coset_rs(0, 6, coset_gen_u64);
+    }
 
     // Public inputs are the two initial values (provided below) and the result (which is generated).
     builder.register_public_input(initial_a);
@@ -38,12 +66,46 @@ fn main() -> Result<()> {
     pw.set_target(initial_b, F::ONE)?;
 
     let data = builder.build::<C>();
-    let proof = data.prove(pw)?;
 
-    println!(
-        "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
-        proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
-    );
+    #[cfg(feature = "timing")]
+    {
+        use log::Level;
+        use plonky2::util::timing::TimingTree;
+        let mut timing = TimingTree::new("prove", Level::Info);
+        println!("Starting proof generation...");
+        let proof =
+            plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?;
+
+        println!(
+            "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
+            proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
+        );
+
+        // Print first few elements of wires_cap for comparison
+        println!("First wires_cap hash: {:?}", proof.proof.wires_cap.0[0]);
+        println!(
+            "First plonk_zs hash: {:?}",
+            proof.proof.plonk_zs_partial_products_cap.0[0]
+        );
+        println!(
+            "First quotient hash: {:?}",
+            proof.proof.quotient_polys_cap.0[0]
+        );
+
+        timing.print();
+        data.verify(proof)?;
+    }
+
+    #[cfg(not(feature = "timing"))]
+    {
+        let proof = data.prove(pw)?;
+        println!(
+            "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
+            proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
+        );
+        data.verify(proof)?;
+    }
 
-    data.verify(proof)
+    println!("finished");
+    Ok(())
 }
diff --git a/plonky2/src/batch_fri/oracle.rs b/plonky2/src/batch_fri/oracle.rs
index 58deeaa3c..1f31b8cf1 100644
--- a/plonky2/src/batch_fri/oracle.rs
+++ b/plonky2/src/batch_fri/oracle.rs
@@ -300,6 +300,18 @@ mod test {
             reduction_arity_bits,
         };
 
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for all dimensions that will be used
+            // This test involves multiple polynomials and recursive verification,
+            // so we initialize a wider range of dimensions to be safe
+            let current_log_size = k0 + fri_params.config.rate_bits;
+            for i in 0..=current_log_size + 5 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let n0 = 1 << k0;
         let n1 = 1 << k1;
         let n2 = 1 << k2;
diff --git a/plonky2/src/batch_fri/prover.rs b/plonky2/src/batch_fri/prover.rs
index e71fe25b4..6815bb1af 100644
--- a/plonky2/src/batch_fri/prover.rs
+++ b/plonky2/src/batch_fri/prover.rs
@@ -263,6 +263,17 @@ mod tests {
         };
 
         let n = 1 << k;
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for all dimensions that will be used
+            let current_log_size = k + fri_params.config.rate_bits;
+            for i in 0..=current_log_size {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let trace = PolynomialValues::new((1..n + 1).map(F::from_canonical_i64).collect_vec());
 
         let polynomial_batch: BatchFriOracle<GoldilocksField, C, D> = BatchFriOracle::from_values(
@@ -359,6 +370,16 @@ mod tests {
             reduction_arity_bits,
         };
 
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for all dimensions that will be used
+            let current_log_size = k0 + fri_params.config.rate_bits;
+            for i in 0..=current_log_size {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let n0 = 1 << k0;
         let n1 = 1 << k1;
         let n2 = 1 << k2;
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index e413071a4..803747e34 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -65,7 +65,8 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         let coeffs = timed!(
             timing,
             "IFFT",
-            values.into_par_iter().map(|v| v.ifft()).collect::<Vec<_>>()
+            // Use sequential iteration for deterministic results
+            values.into_iter().map(|v| v.ifft()).collect::<Vec<_>>()
         );
 
         Self::from_coeffs(
@@ -95,7 +96,20 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         );
 
         let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values));
+        // Debug: Print first leaf for determinism check
+        if !leaves.is_empty() && !leaves[0].is_empty() {
+            println!(
+                "First leaf before reverse_bits: {:?}",
+                &leaves[0][..4.min(leaves[0].len())]
+            );
+        }
         reverse_index_bits_in_place(&mut leaves);
+        if !leaves.is_empty() && !leaves[0].is_empty() {
+            println!(
+                "First leaf after reverse_bits: {:?}",
+                &leaves[0][..4.min(leaves[0].len())]
+            );
+        }
         let merkle_tree = timed!(
             timing,
             "build Merkle tree",
@@ -121,9 +135,16 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
 
         // If blinding, salt with two random elements to each leaf vector.
         let salt_size = if blinding { SALT_SIZE } else { 0 };
+        println!(
+            "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}",
+            polynomials.len(),
+            degree,
+            blinding,
+            salt_size
+        );
 
         polynomials
-            .par_iter()
+            .iter()
             .map(|p| {
                 assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
                 p.lde(rate_bits)
@@ -132,7 +153,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
             })
             .chain(
                 (0..salt_size)
-                    .into_par_iter()
+                    .into_iter()
                     .map(|_| F::rand_vec(degree << rate_bits)),
             )
             .collect()
diff --git a/plonky2/src/gadgets/interpolation.rs b/plonky2/src/gadgets/interpolation.rs
index 39b048af4..9aedf7143 100644
--- a/plonky2/src/gadgets/interpolation.rs
+++ b/plonky2/src/gadgets/interpolation.rs
@@ -86,7 +86,7 @@ mod tests {
 
         let value_targets = values
             .iter()
-            .map(|&v| (builder.constant_extension(v)))
+            .map(|&v| builder.constant_extension(v))
             .collect::<Vec<_>>();
 
         let zt = builder.constant_extension(z);
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 31bcf5e37..0846b7224 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -5,8 +5,16 @@ use core::slice;
 
 use plonky2_maybe_rayon::*;
 use serde::{Deserialize, Serialize};
+#[cfg(feature = "cuda")]
+use zeknox::device::{memory::HostOrDeviceSlice, stream::CudaStream};
+#[cfg(feature = "cuda")]
+use zeknox::{
+    fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
+};
 
 use crate::hash::hash_types::RichField;
+#[cfg(feature = "cuda")]
+use crate::hash::hash_types::NUM_HASH_OUT_ELTS;
 use crate::hash::merkle_proofs::MerkleProof;
 use crate::plonk::config::{GenericHashOut, Hasher};
 use crate::util::log2_strict;
@@ -148,6 +156,125 @@ pub(crate) fn fill_digests_buf<F: RichField, H: Hasher<F>>(
     );
 }
 
+/// Builds the Merkle digests and cap on the GPU via zeknox and copies both back to the host.
+///
+/// Panics if a CUDA allocation, copy or stream operation fails, or if a length overflows `u64`.
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves_ptr: *const F,
+    leaves_len: usize,
+    leaf_len: usize,
+    cap_height: usize,
+    gpu_id: u64,
+) {
+    let digests_count: u64 = digests_buf.len().try_into().unwrap();
+    let leaves_count: u64 = leaves_len.try_into().unwrap();
+    let caps_count: u64 = cap_buf.len().try_into().unwrap();
+    let cap_height: u64 = cap_height.try_into().unwrap();
+    let leaf_size: u64 = leaf_len.try_into().unwrap();
+
+    // Never request a zero-sized device allocation: an empty buffer still gets one hash's worth.
+    let digests_size = digests_buf.len().max(1) * NUM_HASH_OUT_ELTS;
+    let caps_size = cap_buf.len().max(1) * NUM_HASH_OUT_ELTS;
+
+    let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap();
+    let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap();
+
+    let num_gpus: usize = std::env::var("NUM_OF_GPUS").map_or(1, |v| v.parse().unwrap_or(1));
+
+    // SAFETY: both output pointers come from the live device allocations made above.
+    // NOTE(review): `leaves_ptr` validity is left to the caller — confirm at call sites.
+    unsafe {
+        if leaves_count >= (1 << 12) && cap_height > 0 && num_gpus > 1 {
+            // Multi-GPU path
+            fill_digests_buf_linear_multigpu_with_gpu_ptr(
+                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                leaves_ptr as *mut core::ffi::c_void,
+                digests_count,
+                caps_count,
+                leaves_count,
+                leaf_size,
+                cap_height,
+                0, // hash_type: 0 for Poseidon
+            );
+        } else {
+            // Single GPU path
+            fill_digests_buf_linear_gpu_with_gpu_ptr(
+                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                leaves_ptr as *mut core::ffi::c_void,
+                digests_count,
+                caps_count,
+                leaves_count,
+                leaf_size,
+                cap_height,
+                0, // hash_type: 0 for Poseidon
+                gpu_id,
+            );
+        }
+    }
+
+    let stream1 = CudaStream::create().unwrap();
+    let stream2 = CudaStream::create().unwrap();
+
+    // Copy back only into non-empty host buffers: the device allocation is padded to one hash,
+    // so copying its full size would overrun an empty host buffer.
+    if !digests_buf.is_empty() {
+        gpu_digests_buf
+            .copy_to_host_ptr_async(
+                digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                digests_size,
+                &stream1,
+            )
+            .expect("copy digests");
+    }
+    if !cap_buf.is_empty() {
+        gpu_cap_buf
+            .copy_to_host_ptr_async(
+                cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                caps_size,
+                &stream2,
+            )
+            .expect("copy caps");
+    }
+    stream1.synchronize().expect("cuda sync");
+    stream2.synchronize().expect("cuda sync");
+    stream1.destroy().expect("cuda stream destroy");
+    stream2.destroy().expect("cuda stream destroy");
+}
+
+/// Uploads the flattened `leaves` to GPU 0 and fills `digests_buf` and `cap_buf` from them.
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &[F],
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    let leaves_count = leaves.len() / leaf_size;
+    let gpu_id = 0;
+    let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap();
+    // A failed upload must not be ignored: the GPU would hash a buffer that was never written.
+    gpu_leaves_buf.copy_from_host(leaves).expect("copy leaves");
+
+    fill_digests_buf_gpu_ptr::<F, H>(
+        digests_buf,
+        cap_buf,
+        gpu_leaves_buf.as_mut_ptr(),
+        leaves_count,
+        leaf_size,
+        cap_height,
+        gpu_id,
+    );
+}
+
 pub(crate) fn merkle_tree_prove<F: RichField, H: Hasher<F>>(
     leaf_index: usize,
     leaves_len: usize,
@@ -207,10 +334,47 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
-        fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
+
+        // #[cfg(feature = "cuda")]
+        // {
+        //     // Check if we should use GPU acceleration
+        //     // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set
+        //     let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") {
+        //         if let Ok(threshold) = threshold_str.parse::<usize>() {
+        //             leaves.len() >= threshold
+        //         } else {
+        //             leaves.len() >= 1024
+        //         }
+        //     } else {
+        //         leaves.len() >= 1024
+        //     };
+
+        //     if use_gpu {
+        //         // Flatten leaves into 1D vector for GPU
+        //         let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() };
+        //         let zeros = vec![F::ZERO; leaf_size];
+        //         let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves.len() * leaf_size);
+        //         for leaf in &leaves {
+        //             if leaf.is_empty() {
+        //                 leaves_1d.extend(zeros.clone());
+        //             } else {
+        //                 leaves_1d.extend(leaf.clone());
+        //             }
+        //         }
+
+        //         fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
+        //     } else {
+        //         fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
+        //     }
+        // }
+
+        // #[cfg(not(feature = "cuda"))]
+        {
+            fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
+        }
 
         unsafe {
-            // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
+            // SAFETY: `fill_digests_buf` or `fill_digests_buf_gpu` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.
             digests.set_len(num_digests);
             cap.set_len(len_cap);
diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs
index f81508b7a..8e387c4cd 100644
--- a/plonky2/src/iop/generator.rs
+++ b/plonky2/src/iop/generator.rs
@@ -36,7 +36,7 @@ pub fn generate_partial_witness<
     let config = &common_data.config;
     let generators = &prover_data.generators;
     let generator_indices_by_watches = &prover_data.generator_indices_by_watches;
-
+    println!("Initializing witness.");
     let mut witness = PartitionWitness::new(
         config.num_wires,
         common_data.degree(),
@@ -57,6 +57,8 @@ pub fn generate_partial_witness<
 
     let mut buffer = GeneratedValues::empty();
 
+    println!("Starting generator execution.");
+
     // Keep running generators until we fail to make progress.
     while !pending_generator_indices.is_empty() {
         let mut next_pending_generator_indices = Vec::new();
@@ -96,6 +98,8 @@ pub fn generate_partial_witness<
         pending_generator_indices = next_pending_generator_indices;
     }
 
+    println!("Finished generator execution.");
+
     if remaining_generators != 0 {
         return Err(anyhow!("{} generators weren't run", remaining_generators));
     }
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index ac0a683cf..649c811a1 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -150,6 +150,7 @@ where
     let degree = common_data.degree();
 
     set_lookup_wires(prover_data, common_data, &mut partition_witness)?;
+    println!("Set lookup wires.");
 
     let public_inputs = partition_witness.get_targets(&prover_data.public_inputs);
     let public_inputs_hash = C::InnerHasher::hash_no_pad(&public_inputs);
@@ -159,17 +160,25 @@ where
         "compute full witness",
         partition_witness.full_witness()
     );
-
+    println!("Computed full witness.");
     let wires_values: Vec<PolynomialValues<F>> = timed!(
         timing,
         "compute wire polynomials",
+        // Use sequential iteration for deterministic results
         witness
             .wire_values
-            .par_iter()
+            .iter()
             .map(|column| PolynomialValues::new(column.clone()))
             .collect()
     );
-
+    println!("Computed wire polynomials.");
+    // Debug: Print first few wire values to check determinism
+    if !wires_values.is_empty() && !wires_values[0].values.is_empty() {
+        println!(
+            "First wire poly first 5 values: {:?}",
+            &wires_values[0].values[..5.min(wires_values[0].values.len())]
+        );
+    }
     let wires_commitment = timed!(
         timing,
         "compute wires commitment",
@@ -182,7 +191,7 @@ where
             prover_data.fft_root_table.as_ref(),
         )
     );
-
+    println!("Computed wires commitment.");
     let mut challenger = Challenger::<F, C::Hasher>::new();
 
     // Observe the FRI config
@@ -230,6 +239,7 @@ where
         .collect();
     let zs_partial_products = [plonk_z_vecs, partial_products_and_zs.concat()].concat();
 
+    println!("Computed Z and partial products.");
     // All lookup polys: RE and partial SLDCs.
     let lookup_polys =
         compute_all_lookup_polys(&witness, &deltas, prover_data, common_data, has_lookup);
@@ -240,6 +250,7 @@ where
         zs_partial_products
     };
 
+    println!("Computed lookup polynomials.");
     let partial_products_zs_and_lookup_commitment = timed!(
         timing,
         "commit to partial products, Z's and, if any, lookup polynomials",
@@ -272,7 +283,12 @@ where
             &alphas,
         )
     );
+    println!("prover alphas: {:?}", alphas);
+    println!("prover betas: {:?}", betas);
+    println!("prover gammas: {:?}", gammas);
+    println!("prover deltas: {:?}", deltas);
 
+    println!("Split up quotient polys.");
     let all_quotient_poly_chunks: Vec<PolynomialCoeffs<F>> = timed!(
         timing,
         "split up quotient polys",
@@ -288,6 +304,7 @@ where
             .collect()
     );
 
+    println!("Committed to quotient polys.");
     let quotient_polys_commitment = timed!(
         timing,
         "commit to quotient polys",
@@ -301,9 +318,11 @@ where
         )
     );
 
+    println!("Committed to quotient polys.");
     challenger.observe_cap::<C::Hasher>(&quotient_polys_commitment.merkle_tree.cap);
 
     let zeta = challenger.get_extension_challenge::<D>();
+    println!("prover zeta: {:?}", zeta);
     // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and
     // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since
     // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`.
@@ -313,6 +332,7 @@ where
         "Opening point is in the subgroup."
     );
 
+    println!("Constructing the opening set, including lookups.");
     let openings = timed!(
         timing,
         "construct the opening set, including lookups",
@@ -326,6 +346,8 @@ where
             common_data
         )
     );
+    println!("Computed openings.");
+
     challenger.observe_openings(&openings.to_fri_openings());
     let instance = common_data.get_fri_instance(zeta);
 
@@ -347,7 +369,7 @@ where
             timing,
         )
     );
-
+    println!("Computed opening proofs.");
     let proof = Proof::<F, C, D> {
         wires_cap: wires_commitment.merkle_tree.cap,
         plonk_zs_partial_products_cap: partial_products_zs_and_lookup_commitment.merkle_tree.cap,
diff --git a/plonky2/src/plonk/verifier.rs b/plonky2/src/plonk/verifier.rs
index fa1bc14b8..d369656c6 100644
--- a/plonky2/src/plonk/verifier.rs
+++ b/plonky2/src/plonk/verifier.rs
@@ -27,6 +27,11 @@ pub(crate) fn verify<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, c
         &verifier_data.circuit_digest,
         common_data,
     )?;
+    println!("verifier alphas: {:?}", challenges.plonk_alphas);
+    println!("verifier betas: {:?}", challenges.plonk_betas);
+    println!("verifier gammas: {:?}", challenges.plonk_gammas);
+    println!("verifier deltas: {:?}", challenges.plonk_deltas);
+    println!("verifier zeta: {:?}", challenges.plonk_zeta);
 
     verify_with_challenges::<F, C, D>(
         proof_with_pis.proof,
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index 8f9960034..6f2ae608e 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -6,6 +6,8 @@ use alloc::vec::Vec;
 use plonky2_maybe_rayon::*;
 #[doc(inline)]
 pub use plonky2_util::*;
+#[cfg(feature = "cuda")]
+use zeknox::{device::memory::HostOrDeviceSlice, transpose_rev_batch, types::TransposeConfig};
 
 use crate::field::polynomial::PolynomialValues;
 use crate::field::types::Field;
@@ -22,10 +24,114 @@ pub(crate) fn transpose_poly_values<F: Field>(polys: Vec<PolynomialValues<F>>) -
     transpose(&poly_values)
 }
 
+/// Transposes a rectangular matrix on the GPU via zeknox's `transpose_rev_batch`.
+///
+/// The rows are flattened into one host buffer, copied to device 0, transposed
+/// there with `batches = rows` and `log_n = log2(columns)`, then read back and
+/// split into chunks of `rows` elements. Empty input returns an empty `Vec`.
+///
+/// NOTE(review): the `_rev` in the kernel name suggests it may also bit-reverse
+/// indices; confirm the output order matches the CPU `transpose` before use.
+///
+/// # Panics
+///
+/// Panics if the rows differ in length, if the row length is not a power of two,
+/// or if a CUDA allocation or copy fails.
+#[cfg(feature = "cuda")]
+fn transpose_gpu<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
+    use std::time::Instant;
+
+    if matrix.is_empty() || matrix[0].is_empty() {
+        return Vec::new();
+    }
+
+    let num_rows = matrix.len();
+    let num_cols = matrix[0].len();
+    let total_elements = num_rows * num_cols;
+    assert!(
+        matrix.iter().all(|row| row.len() == num_cols),
+        "transpose_gpu requires all rows to have the same length"
+    );
+    // Only `log_n` reaches the kernel, so the width must be an exact power of two.
+    assert!(
+        num_cols.is_power_of_two(),
+        "transpose_gpu requires a power-of-two row length, got {}",
+        num_cols
+    );
+    let log_n = num_cols.trailing_zeros() as usize;
+    let gpu_id = 0;
+
+    // Flatten into one host buffer; it is reused for the read-back, so no zeroed `T`.
+    let mut host_buf: Vec<T> = Vec::with_capacity(total_elements);
+    for row in matrix {
+        host_buf.extend_from_slice(row);
+    }
+
+    let mut gpu_input: HostOrDeviceSlice<'_, T> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements)
+            .expect("failed to allocate the device input buffer");
+    let mut gpu_output: HostOrDeviceSlice<'_, T> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements)
+            .expect("failed to allocate the device output buffer");
+    gpu_input
+        .copy_from_host(&host_buf)
+        .expect("failed to copy the matrix to the device");
+
+    let mut cfg = TransposeConfig::default();
+    cfg.batches = num_rows as u32;
+    cfg.are_inputs_on_device = true;
+    cfg.are_outputs_on_device = true;
+
+    let kernel_timer = Instant::now();
+    transpose_rev_batch(gpu_id, gpu_output.as_mut_ptr(), gpu_input.as_mut_ptr(), log_n, cfg);
+    println!("CUDA transpose of {}x{} took {:?}", num_rows, num_cols, kernel_timer.elapsed());
+
+    // Read the result back and cut it into `num_cols` rows of `num_rows` elements.
+    let readback_timer = Instant::now();
+    gpu_output
+        .copy_to_host(&mut host_buf, total_elements)
+        .expect("failed to copy the transposed matrix back to the host");
+    let result: Vec<Vec<T>> = host_buf.chunks_exact(num_rows).map(<[T]>::to_vec).collect();
+    println!(
+        "CUDA transpose copy back and reshape of {}x{} took {:?}",
+        num_rows, num_cols, readback_timer.elapsed()
+    );
+    result
+}
+
+/// Transposes `matrix`, given as a slice of rows, into a vector of columns.
+///
+/// Output row `i` gathers `row[i]` from every input row in order, so
+/// `result[i][j] == matrix[j][i]`. The width is taken from the first row, and an
+/// empty `matrix` yields an empty result.
+///
+/// # Panics
+///
+/// Panics if any row is shorter than the first one, since `row[i]` is indexed.
 pub fn transpose<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
+    if matrix.is_empty() {
+        return vec![];
+    }
+
     let len = matrix[0].len();
+
+    // GPU dispatch is currently disabled: nothing here calls `transpose_gpu`.
+    // The prototype that used to sit at this point, gated on the `cuda` feature,
+    // worked as follows:
+    //   * compute `total_elements = matrix.len() * len`;
+    //   * take an element-count threshold from the `CUDA_TRANSPOSE_THRESHOLD`
+    //     environment variable, falling back to 2^16 (65536) when the variable
+    //     is unset or does not parse as `usize`;
+    //   * return `transpose_gpu(matrix)` only when `total_elements` reaches the
+    //     threshold and `len` is a power of two.
+    // NOTE(review): why it was switched off is not recorded here; confirm before
+    // re-enabling.
+
+    // CPU path: output row `i` gathers element `i` of every input row.
+    // Sequential iteration was chosen for deterministic results.
+    // NOTE(review): an order-preserving parallel collect would presumably give
+    // the same output; confirm whether the parallel iterator can be restored.
     (0..len)
-        .into_par_iter()
         .map(|i| matrix.iter().map(|row| row[i]).collect())
         .collect()
 }

From a04950fda99e1a8da2c24cf39a336543ba55ed22 Mon Sep 17 00:00:00 2001
From: zhenfeizhang <zhenfei.zhang@hotmail.com>
Date: Mon, 24 Nov 2025 08:54:30 -0500
Subject: [PATCH 07/37] wip

---
 field/Cargo.toml                |  4 +-
 field/perm_comp.md              | 34 +++++++++++++
 field/src/fft.rs                | 85 ++++++++++++++++++---------------
 field/src/polynomial/mod.rs     | 24 ++++++----
 plonky2/Cargo.toml              |  3 +-
 plonky2/examples/fibonacci.rs   | 10 ++--
 plonky2/src/hash/merkle_tree.rs | 68 +++++++++++++-------------
 7 files changed, 138 insertions(+), 90 deletions(-)
 create mode 100644 field/perm_comp.md

diff --git a/field/Cargo.toml b/field/Cargo.toml
index 12e38e354..1b6a62d71 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -36,6 +36,6 @@ workspace = true
 
 
 [features]
-# default = []
-default = [ "cuda" ]
+default = []
+# default = [ "cuda" ]
 cuda = []
\ No newline at end of file
diff --git a/field/perm_comp.md b/field/perm_comp.md
new file mode 100644
index 000000000..dec02f91f
--- /dev/null
+++ b/field/perm_comp.md
@@ -0,0 +1,34 @@
+# Performance comparison
+- CPU: AMD Ryzen 9 7950X3D (16 cores)
+- GPU: NVIDIA GeForce RTX 4080 SUPER (single card)
+- Times are in seconds; speedup is CPU time divided by GPU time
+
+| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? |
+|-----------|---------|---------|---------|------------|
+| **Run generators** | 1.7767 | 1.7899 | 0.99x | ✗ Not accelerated |
+| **Compute full witness** | 0.3369 | 0.3362 | 1.00x | ✗ Not accelerated |
+| **Compute wire polynomials** | 0.0396 | 0.0392 | 1.01x | ✗ Not accelerated |
+| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes |
+| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** |
+| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** |
+| └─ Transpose LDEs | 2.8010 | 2.7881 | 1.00x | ✗ Not accelerated |
+| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned |
+| **Compute partial products** | 0.1700 | 0.1671 | 1.02x | ✗ Not accelerated |
+| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes |
+| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** |
+| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** |
+| └─ Transpose LDEs | 0.3906 | 0.3874 | 1.01x | ✗ Not accelerated |
+| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned |
+| **Compute quotient polys** | 1.4041 | 1.3128 | 1.07x | ✗ Not accelerated |
+| **Split quotient polys** | 0.0098 | 0.0212 | 0.46x | ✗ Not accelerated |
+| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes |
+| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** |
+| └─ Transpose LDEs | 0.2952 | 0.2908 | 1.02x | ✗ Not accelerated |
+| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned |
+| **Construct opening set** | 0.1609 | 0.1600 | 1.01x | ✗ Not accelerated |
+| **Compute opening proofs** | 1.3580 | 1.2919 | 1.05x | ✗ Not accelerated |
+| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | 1.02x | ✗ Not accelerated |
+| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | 1.02x | ✗ Not accelerated |
+| └─ Final FFT 4194304 | 0.3083 | 0.3023 | 1.02x | ✗ Not accelerated |
+| └─ Fold codewords | 0.1312 | 0.0904 | **1.45x** | ✗ Not accelerated |
+| └─ Find PoW witness | 0.0014 | 0.0038 | 0.37x | ✗ Not accelerated |
\ No newline at end of file
diff --git a/field/src/fft.rs b/field/src/fft.rs
index 9c6c8fed6..bccfb3486 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -38,43 +38,46 @@ fn fft_dispatch_gpu<F: Field>(
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    use zeknox::ntt_batch;
-    use zeknox::types::NTTConfig;
-
-    let mut a = input.to_vec();
-    let mut b = input.to_vec();
-
-    ntt_batch(
-        0,
-        a.as_mut_ptr(),
-        input.len().trailing_zeros() as usize,
-        NTTConfig::default(),
-    );
-
-    fft_dispatch_cpu(&mut b, zero_factor, root_table);
-    ark_std::println!("a: {:?}", a);
-    ark_std::println!("b: {:?}", b);
-
-    assert_eq!(
-        a, b,
-        "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}",
-        a, b, input
-    );
+    // if F::CUDA_SUPPORT {
+    //     use zeknox::ntt_batch;
+    //     use zeknox::types::NTTConfig;
 
-    input.copy_from_slice(&a);
+    //     let mut a = input.to_vec();
+    //     let mut b = input.to_vec();
 
-    // use zeknox::ntt_batch;
-    // use zeknox::types::NTTConfig;
-    // if F::CUDA_SUPPORT {
-    //     return ntt_batch(
+    //     ntt_batch(
     //         0,
-    //         input.as_mut_ptr(),
+    //         a.as_mut_ptr(),
     //         input.len().trailing_zeros() as usize,
     //         NTTConfig::default(),
     //     );
-    // } else {
-    //     return fft_dispatch_cpu(input, zero_factor, root_table);
+
+    //     fft_dispatch_cpu(&mut b, zero_factor, root_table);
+    //     ark_std::println!("a: {:?}", a);
+    //     ark_std::println!("b: {:?}", b);
+
+    //     assert_eq!(
+    //         a, b,
+    //         "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}",
+    //         a, b, input
+    //     );
+
+    //     input.copy_from_slice(&a);
     // }
+    // return fft_dispatch_cpu(input, zero_factor, root_table);
+
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+    if F::CUDA_SUPPORT {
+        return ntt_batch(
+            0,
+            input.as_mut_ptr(),
+            input.len().trailing_zeros() as usize,
+            NTTConfig::default(),
+        );
+    } else {
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
 }
 
 /// Batch FFT computation for multiple polynomials on GPU
@@ -269,7 +272,7 @@ pub fn coset_fft_batch_with_options<F: Field>(
     // }
 }
 
-fn fft_dispatch_cpu<F: Field>(
+pub(crate) fn fft_dispatch_cpu<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
@@ -298,10 +301,15 @@ fn fft_dispatch<F: Field>(
     root_table: Option<&FftRootTable<F>>,
 ) {
     #[cfg(feature = "cuda")]
-    return fft_dispatch_gpu(input, zero_factor, root_table);
-
+    {
+        // ark_std::println!("Using GPU FFT dispatch");
+        return fft_dispatch_gpu(input, zero_factor, root_table);
+    }
     #[cfg(not(feature = "cuda"))]
-    return fft_dispatch_cpu(input, zero_factor, root_table);
+    {
+        // ark_std::println!("Using CPU FFT dispatch");
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
 }
 
 #[inline]
@@ -529,14 +537,15 @@ mod tests {
     #[cfg(feature = "cuda")]
     use zeknox::init_twiddle_factors_rs;
 
-    use crate::fft::{
-        coset_fft_batch, fft, fft_batch, fft_dispatch_cpu, fft_dispatch_gpu, fft_with_options, ifft,
-    };
+    #[cfg(feature = "cuda")]
+    use crate::fft::{coset_fft_batch, fft_dispatch_cpu, fft_dispatch_gpu};
+    use crate::fft::{fft, fft_batch, fft_with_options, ifft};
     use crate::goldilocks_field::GoldilocksField;
     use crate::polynomial::{PolynomialCoeffs, PolynomialValues};
     use crate::types::Field;
 
     #[test]
+    #[cfg(feature = "cuda")]
     fn test_kat() {
         init_twiddle_factors_rs(0, 4);
 
@@ -669,7 +678,7 @@ mod tests {
         type F = GoldilocksField;
 
         // Test various polynomial sizes
-        for log_size in [8, 10, 12, 14] {
+        for log_size in [8, 10, 12, 14,16,18,20] {
             let size = 1 << log_size;
             zeknox::clear_cuda_errors_rs();
             init_twiddle_factors_rs(0, log_size);
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index 28f6e8ed3..a78cc10d1 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -12,7 +12,7 @@ use plonky2_util::log2_strict;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension};
-use crate::fft::{fft, fft_with_options, ifft, FftRootTable};
+use crate::fft::{fft, fft_dispatch_cpu, fft_with_options, ifft, FftRootTable};
 use crate::types::Field;
 
 /// A polynomial in point-value form.
@@ -283,22 +283,26 @@ impl<F: Field> PolynomialCoeffs<F> {
         zero_factor: Option<usize>,
         root_table: Option<&FftRootTable<F>>,
     ) -> PolynomialValues<F> {
-        // #[cfg(feature = "cuda")]
-        // {
-        //     if F::CUDA_SUPPORT && shift == F::coset_shift() {
-        //         // Use GPU coset FFT directly without CPU-side coefficient modification
-        //         return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
-        //     }
-        // }
+        #[cfg(feature = "cuda")]
+        {
+            if F::CUDA_SUPPORT && shift == F::coset_shift() {
+                // Use GPU coset FFT directly without CPU-side coefficient modification
+                // ark_std::println!("Using GPU coset FFT: degree {}", self.len() - 1);
+                return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
+            }
+        }
 
         // CPU path: multiply by powers of shift, then do regular FFT
-        let modified_poly: Self = shift
+        let mut modified_poly: Self = shift
             .powers()
             .zip(&self.coeffs)
             .map(|(r, &c)| r * c)
             .collect::<Vec<_>>()
             .into();
-        modified_poly.fft_with_options(zero_factor, root_table)
+
+        fft_dispatch_cpu(&mut modified_poly.coeffs, zero_factor, root_table);
+        modified_poly.coeffs.into()
+        // modified_poly.fft_with_options(zero_factor, root_table)
     }
 
     pub fn to_extension<const D: usize>(&self) -> PolynomialCoeffs<F::Extension>
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index c5c282928..c84876752 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -12,7 +12,8 @@ keywords.workspace = true
 categories.workspace = true
 
 [features]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda"]
+default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
 std = ["anyhow/std", "rand/std", "itertools/use_std"]
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index dfce8de8b..f456ce46c 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -27,7 +27,7 @@ fn main() -> Result<()> {
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..99 {
+    for _ in 0..9999999 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
@@ -45,14 +45,14 @@ fn main() -> Result<()> {
         //     zeknox::init_twiddle_factors_rs(0, i);
         // }
 
-        zeknox::init_twiddle_factors_rs(0, 3);
-        zeknox::init_twiddle_factors_rs(0, 6);
+        zeknox::init_twiddle_factors_rs(0, 19);
+        zeknox::init_twiddle_factors_rs(0, 22);
         // Initialize coset on GPU
         // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
         // TODO: Make this generic for other fields if needed
         let coset_gen_u64 = 7u64;
-        // zeknox::init_coset_rs(0, 19, coset_gen_u64);
-        zeknox::init_coset_rs(0, 6, coset_gen_u64);
+        zeknox::init_coset_rs(0, 22, coset_gen_u64);
+        // zeknox::init_coset_rs(0, 16, coset_gen_u64);
     }
 
     // Public inputs are the two initial values (provided below) and the result (which is generated).
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 0846b7224..495825147 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -335,40 +335,40 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
 
-        // #[cfg(feature = "cuda")]
-        // {
-        //     // Check if we should use GPU acceleration
-        //     // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set
-        //     let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") {
-        //         if let Ok(threshold) = threshold_str.parse::<usize>() {
-        //             leaves.len() >= threshold
-        //         } else {
-        //             leaves.len() >= 1024
-        //         }
-        //     } else {
-        //         leaves.len() >= 1024
-        //     };
-
-        //     if use_gpu {
-        //         // Flatten leaves into 1D vector for GPU
-        //         let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() };
-        //         let zeros = vec![F::ZERO; leaf_size];
-        //         let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves.len() * leaf_size);
-        //         for leaf in &leaves {
-        //             if leaf.is_empty() {
-        //                 leaves_1d.extend(zeros.clone());
-        //             } else {
-        //                 leaves_1d.extend(leaf.clone());
-        //             }
-        //         }
-
-        //         fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
-        //     } else {
-        //         fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
-        //     }
-        // }
-
-        // #[cfg(not(feature = "cuda"))]
+        #[cfg(feature = "cuda")]
+        {
+            // Check if we should use GPU acceleration
+            // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set
+            let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") {
+                if let Ok(threshold) = threshold_str.parse::<usize>() {
+                    leaves.len() >= threshold
+                } else {
+                    leaves.len() >= 1024
+                }
+            } else {
+                leaves.len() >= 1024
+            };
+
+            if use_gpu {
+                // Flatten leaves into 1D vector for GPU
+                let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() };
+                let zeros = vec![F::ZERO; leaf_size];
+                let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves.len() * leaf_size);
+                for leaf in &leaves {
+                    if leaf.is_empty() {
+                        leaves_1d.extend(zeros.clone());
+                    } else {
+                        leaves_1d.extend(leaf.clone());
+                    }
+                }
+
+                fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
+            } else {
+                fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
+            }
+        }
+
+        #[cfg(not(feature = "cuda"))]
         {
             fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
         }

From 5f625e41a6d72df3b03e89a91b0348d11df9d271 Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Mon, 24 Nov 2025 09:19:24 -0500
Subject: [PATCH 08/37] clean up

---
 BENCHMARK_RESULTS.md               | 166 ------------
 arch.md                            | 417 -----------------------------
 field/perm_comp.md => perm_comp.md |   0
 task.md                            |   1 -
 4 files changed, 584 deletions(-)
 delete mode 100644 BENCHMARK_RESULTS.md
 delete mode 100644 arch.md
 rename field/perm_comp.md => perm_comp.md (100%)
 delete mode 100644 task.md

diff --git a/BENCHMARK_RESULTS.md b/BENCHMARK_RESULTS.md
deleted file mode 100644
index 11cd934e4..000000000
--- a/BENCHMARK_RESULTS.md
+++ /dev/null
@@ -1,166 +0,0 @@
-# Poseidon vs Poseidon2 Performance Benchmark Results
-
-## Summary
-
-This document presents benchmark results comparing **PoseidonGoldilocksConfig** (original Poseidon hash) vs **Poseidon2GoldilocksConfig** (hybrid configuration using Poseidon2 for Merkle trees).
-
-## Configuration Details
-
-- **PoseidonGoldilocksConfig**: Uses Poseidon hash for both external (Merkle trees) and internal (circuit) hashing
-- **Poseidon2GoldilocksConfig**: Uses Poseidon2 for external hashing (Merkle trees), Poseidon for internal hashing (circuits)
-
-## Benchmark Results
-
-### Circuit Size: 100 iterations
-
-| Configuration | Build Time | Prove Time | Verify Time | Total Time | Speedup |
-|--------------|------------|------------|-------------|------------|---------|
-| Poseidon     | 4.57ms     | 6.21ms     | 1.36ms      | 12.14ms    | -       |
-| Poseidon2    | 2.41ms     | 6.36ms     | 1.88ms      | 10.65ms    | **1.14x** |
-
-**Build speedup: 1.90x** (4.57ms → 2.41ms)
-
-### Circuit Size: 500 iterations
-
-| Configuration | Build Time | Prove Time | Verify Time | Total Time | Speedup |
-|--------------|------------|------------|-------------|------------|---------|
-| Poseidon     | 3.32ms     | 2.82ms     | 1.50ms      | 7.63ms     | -       |
-| Poseidon2    | 3.28ms     | 13.71ms    | 2.17ms      | 19.16ms    | **0.40x** ⚠️ |
-
-**Note**: Poseidon2 is slower here - likely due to AVX2 warmup or different circuit structure.
-
-### Circuit Size: 1000 iterations
-
-| Configuration | Build Time | Prove Time | Verify Time | Total Time | Speedup |
-|--------------|------------|------------|-------------|------------|---------|
-| Poseidon     | 5.05ms     | 8.90ms     | 1.87ms      | 15.82ms    | -       |
-| Poseidon2    | 5.86ms     | 6.97ms     | 2.77ms      | 15.60ms    | **1.01x** |
-
-**Prove speedup: 1.28x** (8.90ms → 6.97ms)
-
-## Analysis
-
-### Key Observations
-
-1. **Build Time**: Poseidon2 shows significant improvement for small circuits (1.90x faster at size 100) but becomes comparable or slightly slower for larger circuits.
-
-2. **Proof Generation**:
-   - For small circuits (100): Similar performance (6.21ms vs 6.36ms)
-   - For medium circuits (500): Poseidon2 is unexpectedly slower (needs investigation)
-   - For large circuits (1000): Poseidon2 shows **1.28x speedup** (8.90ms → 6.97ms)
-
-3. **Verification Time**: Poseidon2 is consistently slower in verification (1.36ms → 1.88ms for size 100), likely due to different hash function overhead.
-
-4. **Overall Performance**: Mixed results, with best performance at small (100) and large (1000) circuit sizes.
-
-### Performance Breakdown
-
-#### Circuit Size: 100
-```
-Poseidon:  Build 37.6% | Prove 51.2% | Verify 11.2%
-Poseidon2: Build 22.6% | Prove 59.7% | Verify 17.7%
-```
-
-#### Circuit Size: 1000
-```
-Poseidon:  Build 31.9% | Prove 56.3% | Verify 11.8%
-Poseidon2: Build 37.5% | Prove 44.7% | Verify 17.8%
-```
-
-## Performance Characteristics
-
-### Where Poseidon2 Excels
-
-✅ **Proof generation for larger circuits** (1.28x speedup at 1000 iterations)
-- Better performance in Merkle tree construction
-- More efficient FRI commitments with AVX2 optimizations
-- Improved matrix multiplication in Poseidon2 hash
-
-✅ **Circuit building for small circuits** (1.90x speedup at 100 iterations)
-- Faster initial setup
-- Efficient sponge construction
-
-### Where Poseidon2 Shows No Improvement
-
-⚠️ **Medium-sized circuits** (500 iterations)
-- Unexpected slowdown in proof generation
-- Possibly due to CPU cache effects or AVX2 warmup
-- Requires further investigation
-
-❌ **Verification time**
-- Consistently 30-40% slower
-- Likely due to Poseidon2 hash computation overhead in verification
-
-## Recommendations
-
-### When to Use Poseidon2GoldilocksConfig
-
-1. **Large circuits with many constraints** - Shows clear proof generation speedup
-2. **Applications prioritizing proof generation over verification** - If prover performance is critical
-3. **Batch proof generation** - Amortizes the warmup cost
-
-### When to Use PoseidonGoldilocksConfig
-
-1. **Applications with frequent verification** - Original Poseidon verifies faster
-2. **Medium-sized circuits** - More consistent performance
-3. **When stability is critical** - Well-tested, mature implementation
-
-## Technical Details
-
-### Hash Function Differences
-
-**Poseidon**:
-- 12-element state width
-- 8 full rounds + 22 partial rounds
-- Standard MDS matrix
-
-**Poseidon2**:
-- 12-element state width
-- 8 full rounds + 22 partial rounds
-- Optimized M_E (external) matrix using M_4 blocks
-- AVX2-accelerated matrix multiplication
-- More efficient internal diffusion layer
-
-### AVX2 Optimizations
-
-Both implementations use AVX2 SIMD instructions for:
-- S-box computation (x^7 in Goldilocks field)
-- Matrix-vector multiplication
-- Round constant addition
-
-Poseidon2 additionally optimizes:
-- Block-wise M_4 matrix application
-- Internal layer diffusion with diagonal matrix
-
-## Future Work
-
-1. **Investigate 500-iteration slowdown** - Profile to understand performance regression
-2. **Benchmark with different circuit types** - Test with other operations beyond Fibonacci
-3. **Measure memory usage** - Compare memory footprint between configurations
-4. **Test on different hardware** - Verify AVX2 benefits across CPUs
-5. **Implement Poseidon2Gate** - Enable full Poseidon2 support for in-circuit hashing
-
-## Running the Benchmark
-
-To reproduce these results:
-
-```bash
-cargo run --release --example bench_poseidon_vs_poseidon2
-```
-
-To benchmark with custom circuit sizes, modify the `circuit_sizes` vector in `main()`:
-
-```rust
-let circuit_sizes = vec![100, 500, 1000, 2000, 5000];
-```
-
-## System Information
-
-- **CPU**: x86_64 with AVX2 support
-- **Compiler**: rustc with release optimizations
-- **Build**: `--release` with target-cpu=native recommended for best performance
-
----
-
-**Generated**: 2025-11-07
-**Benchmark Tool**: [bench_poseidon_vs_poseidon2.rs](plonky2/examples/bench_poseidon_vs_poseidon2.rs)
diff --git a/arch.md b/arch.md
deleted file mode 100644
index 257652a62..000000000
--- a/arch.md
+++ /dev/null
@@ -1,417 +0,0 @@
-# Plonky2 Circuit Architecture
-
-This document explains how circuits are laid out and structured in Plonky2.
-
-## Table of Contents
-- [Circuit Matrix Structure](#circuit-matrix-structure)
-- [Gate Placement](#gate-placement)
-- [Wire Organization](#wire-organization)
-- [Data Structure Hierarchy](#data-structure-hierarchy)
-- [Constraint System](#constraint-system)
-- [Copy Constraints & Permutation](#copy-constraints--permutation)
-- [Witness Generation Pipeline](#witness-generation-pipeline)
-- [Polynomial Commitments](#polynomial-commitments)
-- [Key Design Principles](#key-design-principles)
-
-## Circuit Matrix Structure
-
-The circuit is fundamentally a **2D matrix**:
-- **Rows**: Gates (operations), numbered 0 to `degree`
-- **Columns**: 135 wires total
-  - 80 routed wires (participate in copy constraints/permutation argument)
-  - 55 advice wires (local to gates, used for intermediate values)
-
-**Reference**: [plonky2/src/plonk/circuit_builder.rs:141-207](plonky2/src/plonk/circuit_builder.rs#L141-L207)
-
-### Wire Layout
-```
-Wire Index │ Type          │ Purpose
-───────────┼───────────────┼──────────────────────────────
-0-79       │ Routed        │ Can be connected across gates via permutation
-80-134     │ Advice        │ Local helper wires, not part of permutation
-```
-
-**Reference**: [plonky2/src/plonk/circuit_data.rs:56-88](plonky2/src/plonk/circuit_data.rs#L56-L88)
-
-```rust
-pub const NUM_ROUTED_WIRES: usize = 80;
-pub const NUM_ADVICE_WIRES: usize = 55;
-pub const NUM_WIRES: usize = NUM_ROUTED_WIRES + NUM_ADVICE_WIRES;
-```
-
-## Gate Placement
-
-Gates are placed **sequentially** in the circuit matrix using a greedy algorithm:
-
-1. Each gate type defines how many constraint "slots" it needs
-2. The builder searches for the next available slot using `find_slot()`
-3. Gates are packed efficiently to minimize circuit size
-4. Gates with the same degree are grouped together
-
-**Reference**: [plonky2/src/plonk/circuit_builder.rs:815-845](plonky2/src/plonk/circuit_builder.rs#L815-L845)
-
-### Selector Polynomials
-
-Instead of having one selector per gate type, Plonky2 uses **selector polynomials** that partition gates by degree:
-- Gates of degree D are grouped together
-- Selector polynomial is 1 for gates of that degree, 0 elsewhere
-- This enables efficient constraint evaluation without per-gate filtering
-
-**Reference**: [plonky2/src/plonk/get_vecs.rs:12-68](plonky2/src/plonk/get_vecs.rs#L12-L68)
-
-## Wire Organization
-
-### Routed Wires (0-79)
-- Participate in the **permutation argument**
-- Can be connected across different gates
-- Used for inputs/outputs that need to be constrained equal
-- Example: connecting output of gate A to input of gate B
-
-### Advice Wires (80-134)
-- Local to individual gates
-- Do NOT participate in permutation
-- Used for intermediate computations
-- Reduces pressure on routed wires
-- Example: temporary values in arithmetic operations
-
-**Reference**: [plonky2/src/plonk/circuit_data.rs:56-88](plonky2/src/plonk/circuit_data.rs#L56-L88)
-
-## Data Structure Hierarchy
-
-The circuit data is split into three components for efficiency:
-
-```
-CircuitData
-├── ProverOnlyCircuitData
-│   ├── Generators (compute witness values)
-│   ├── Sigma polynomials (permutation mappings)
-│   ├── Forest (union-find for copy constraints)
-│   ├── Representative map
-│   └── FFT precomputation tables
-├── VerifierOnlyCircuitData
-│   ├── Constants Merkle cap
-│   └── Circuit digest
-└── CommonCircuitData
-    ├── CircuitConfig
-    ├── Gates (list of all gate instances)
-    ├── Selectors (degree-based partitioning)
-    ├── Quotient degree factor
-    ├── Public input indices
-    ├── FRI parameters
-    └── Circuit digest
-```
-
-### CircuitData
-Main container holding all circuit information.
-
-**Reference**: [plonky2/src/plonk/circuit_data.rs:185-191](plonky2/src/plonk/circuit_data.rs#L185-L191)
-
-### ProverOnlyCircuitData
-Information needed only by the prover:
-- **Generators**: Compute witness values from partial witness
-- **Sigma polynomials**: Encode the permutation mapping for copy constraints
-- **Forest**: Union-find data structure tracking which wires are constrained equal
-- **FFT tables**: Precomputed for polynomial operations
-
-**Reference**: [plonky2/src/plonk/circuit_data.rs:428-440](plonky2/src/plonk/circuit_data.rs#L428-L440)
-
-### VerifierOnlyCircuitData
-Minimal information for verification:
-- Constants Merkle cap (commitment to constants)
-- Circuit digest (hash of circuit structure)
-
-**Reference**: [plonky2/src/plonk/circuit_data.rs:402-426](plonky2/src/plonk/circuit_data.rs#L402-L426)
-
-### CommonCircuitData
-Shared between prover and verifier:
-- Configuration parameters
-- Gate definitions
-- Selector polynomials
-- Public input locations
-- FRI parameters
-
-**Reference**: [plonky2/src/plonk/circuit_data.rs:442-480](plonky2/src/plonk/circuit_data.rs#L442-L480)
-
-## Constraint System
-
-Each gate implements the `Gate` trait which defines:
-
-```rust
-pub trait Gate<F: RichField + Extendable<D>, const D: usize>: ... {
-    fn num_wires(&self) -> usize;           // How many wires it uses
-    fn num_constants(&self) -> usize;        // How many constants it needs
-    fn degree(&self) -> usize;               // Max degree of constraints
-    fn num_constraints(&self) -> usize;      // Number of polynomial equations
-
-    fn eval_unfiltered(&self, ...);          // Evaluate constraints
-    fn eval_filtered(&self, ...);            // Evaluate with selector
-}
-```
-
-**Reference**: [plonky2/src/plonk/gates/gate.rs:53-260](plonky2/src/plonk/gates/gate.rs#L53-L260)
-
-### Constraint Evaluation
-
-Constraints are evaluated in **point-major order**:
-- Evaluate all constraints at point 1
-- Then all constraints at point 2
-- Then all constraints at point 3
-- ...
-
-This is more SIMD-friendly than gate-major order.
-
-**Reference**: [plonky2/src/plonk/get_vecs.rs:70-110](plonky2/src/plonk/get_vecs.rs#L70-L110)
-
-### Gate Instance
-
-A gate instance consists of:
-- Gate index (which gate definition)
-- Row index (which row in the circuit matrix)
-
-**Reference**: [plonky2/src/plonk/gates/gate.rs:319-322](plonky2/src/plonk/gates/gate.rs#L319-L322)
-
-## Copy Constraints & Permutation
-
-Plonky2 uses the **PLONK permutation argument** to enforce that wires constrained to be equal actually have equal values.
-
-### Forest (Union-Find)
-
-Tracks which wires are constrained to be equal:
-- Each wire starts in its own set
-- `copy_constraint(w1, w2)` unions the sets
-- Eventually computes a permutation mapping
-
-**Reference**: [plonky2/src/plonk/permutation_argument.rs:13-156](plonky2/src/plonk/permutation_argument.rs#L13-L156)
-
-### Sigma Polynomials
-
-Encode the permutation mapping:
-- For each wire `w`, `sigma(w)` tells you the next wire in its equivalence class
-- Forms a cycle through all wires that must be equal
-- Committed as part of the circuit structure
-
-**Reference**: [plonky2/src/plonk/permutation_argument.rs:45-91](plonky2/src/plonk/permutation_argument.rs#L45-L91)
-
-### Permutation Argument
-
-During proving:
-1. Compute partial products based on wires and sigmas
-2. Accumulate these into the `Z` polynomial
-3. Prove that `Z` forms a valid permutation product
-
-During verification:
-- Check that permutation constraints hold at random point
-
-**Reference**: [plonky2/src/plonk/prover.rs:250-289](plonky2/src/plonk/prover.rs#L250-L289)
-
-## Witness Generation Pipeline
-
-The witness goes through three forms:
-
-```
-PartialWitness (sparse, user-provided)
-    ↓ (apply generators)
-PartitionWitness (respects copy constraints)
-    ↓ (flatten to column-major matrix)
-MatrixWitness (dense, ready for polynomials)
-```
-
-### PartialWitness
-
-- Sparse representation (HashMap)
-- User provides initial values (public inputs, private inputs)
-- Not all wires need to be set
-
-**Reference**: [plonky2/src/iop/witness.rs:283-308](plonky2/src/iop/witness.rs#L283-L308)
-
-### PartitionWitness
-
-- Organized by copy-constraint partitions
-- Each partition has one representative wire
-- Setting a wire sets all wires in its partition
-- Generators fill in missing values
-
-**Reference**: [plonky2/src/iop/witness.rs:310-377](plonky2/src/iop/witness.rs#L310-L377)
-
-### MatrixWitness
-
-- Dense 2D array
-- Column-major layout (wires are contiguous)
-- Ready to be converted to polynomials via FFT
-- Used for final proof generation
-
-**Reference**: [plonky2/src/iop/witness.rs:379-402](plonky2/src/iop/witness.rs#L379-L402)
-
-### Generators
-
-Generators compute derived witness values:
-- Take some inputs and compute outputs
-- Run in topological order based on dependencies
-- Examples: arithmetic operations, hash outputs, lookup multiplicities
-
-**Reference**: [plonky2/src/iop/generator.rs:33-142](plonky2/src/iop/generator.rs#L33-L142)
-
-## Polynomial Commitments
-
-Plonky2 uses FRI for polynomial commitments. There are four oracles:
-
-### Oracle 1: CONSTANTS_SIGMAS
-- Constants (gate constants, public inputs)
-- Sigma polynomials (permutation mappings)
-- **Not blinded** (deterministic, part of circuit structure)
-
-**Reference**: [plonky2/src/plonk/prover.rs:119-148](plonky2/src/plonk/prover.rs#L119-L148)
-
-### Oracle 2: WIRES
-- Wire witness values
-- **Blinded** with random salt
-- Committed after witness generation
-
-**Reference**: [plonky2/src/plonk/prover.rs:153-181](plonky2/src/plonk/prover.rs#L153-L181)
-
-### Oracle 3: ZS_PARTIAL_PRODUCTS
-- Permutation product polynomial (Z)
-- Partial products for permutation argument
-- **Blinded**
-
-**Reference**: [plonky2/src/plonk/prover.rs:250-289](plonky2/src/plonk/prover.rs#L250-L289)
-
-### Oracle 4: QUOTIENT
-- Quotient polynomial from constraint division
-- Proves all constraints are satisfied
-- **Blinded**
-
-**Reference**: [plonky2/src/plonk/prover.rs:291-339](plonky2/src/plonk/prover.rs#L291-L339)
-
-### Polynomial Batch Process
-
-For each oracle:
-1. Coefficients in evaluation form
-2. **FFT** to coefficient form
-3. **Low-degree extension** (LDE) by interpolation
-4. Add **blinding salt** (random polynomial)
-5. Evaluate LDE on larger domain
-6. Build **Merkle tree** over evaluations
-7. Return Merkle cap as commitment
-
-**Reference**: [plonky2/src/plonk/prover.rs:73-111](plonky2/src/plonk/prover.rs#L73-L111)
-
-### Opening Points
-
-Polynomials are opened at two points:
-- `zeta`: Random challenge point
-- `g * zeta`: Next point in coset (for permutation argument)
-
-**Reference**: [plonky2/src/plonk/verifier.rs:42-167](plonky2/src/plonk/verifier.rs#L42-L167)
-
-## Key Design Principles
-
-### 1. Efficiency Through Selectors
-Selector polynomials group gates by degree, enabling constraint evaluation without filtering by individual gate type. This is more efficient than standard PLONK.
-
-### 2. Routed vs Advice Wires
-Separating routed wires (participate in permutation) from advice wires (local to gates) reduces the cost of the permutation argument while maintaining flexibility.
-
-### 3. Modularity
-Gates are self-contained with their own constraint logic. New gates can be added without modifying the core proving system.
-
-### 4. SIMD-Friendly Layout
-- Point-major constraint evaluation
-- Column-major witness layout
-- Both enable efficient vectorization
-
-### 5. Prover/Verifier Separation
-Splitting data into ProverOnly, VerifierOnly, and Common minimizes what the verifier needs, reducing verification cost.
-
-### 6. Generator Pipeline
-The generator system allows complex witness computation while maintaining a clean separation between circuit definition and witness generation.
-
-### 7. Lookup Arguments
-Lookup tables enable efficient range checks, XOR operations, and other lookups without expensive bitwise constraints.
-
-**Reference**: [plonky2/src/plonk/circuit_builder.rs:1357-1472](plonky2/src/plonk/circuit_builder.rs#L1357-L1472)
-
-## Advanced Features
-
-### Recursion
-Plonky2 can verify its own proofs:
-- Verifier circuit is built using the circuit builder
-- Enables proof composition and aggregation
-- Special gates for efficient field arithmetic
-
-**Reference**: [plonky2/src/recursion/](plonky2/src/recursion/)
-
-### Custom Gates
-Users can define custom gates for specific operations:
-- Implement the `Gate` trait
-- Define constraints and evaluation logic
-- Register with the circuit builder
-
-**Reference**: [plonky2/src/plonk/gates/gate.rs:53-260](plonky2/src/plonk/gates/gate.rs#L53-L260)
-
-### Lookup Tables
-Efficient lookups for operations like:
-- Range checks
-- Bitwise operations (XOR, AND)
-- Small field operations
-- S-boxes (for hash functions)
-
-**Reference**: [plonky2/src/gates/lookup.rs](plonky2/src/gates/lookup.rs), [plonky2/src/gates/lookup_table.rs](plonky2/src/gates/lookup_table.rs)
-
-## File Reference Index
-
-### Core Circuit Structure
-- [plonky2/src/plonk/circuit_data.rs](plonky2/src/plonk/circuit_data.rs) - Main data structures
-- [plonky2/src/plonk/circuit_builder.rs](plonky2/src/plonk/circuit_builder.rs) - Circuit construction
-
-### Gates
-- [plonky2/src/plonk/gates/gate.rs](plonky2/src/plonk/gates/gate.rs) - Gate trait and instances
-- [plonky2/src/gates/](plonky2/src/gates/) - Concrete gate implementations
-
-### Witness
-- [plonky2/src/iop/witness.rs](plonky2/src/iop/witness.rs) - Witness types
-- [plonky2/src/iop/generator.rs](plonky2/src/iop/generator.rs) - Generator system
-
-### Proving & Verification
-- [plonky2/src/plonk/prover.rs](plonky2/src/plonk/prover.rs) - Proof generation
-- [plonky2/src/plonk/verifier.rs](plonky2/src/plonk/verifier.rs) - Proof verification
-
-### Permutation Argument
-- [plonky2/src/plonk/permutation_argument.rs](plonky2/src/plonk/permutation_argument.rs) - Copy constraints
-
-### Polynomials
-- [plonky2/src/plonk/get_vecs.rs](plonky2/src/plonk/get_vecs.rs) - Polynomial evaluation
-- [plonky2/src/fri/](plonky2/src/fri/) - FRI commitment scheme
-
-### Recursion
-- [plonky2/src/recursion/](plonky2/src/recursion/) - Recursive proof verification
-
-## Example: Simple Circuit
-
-Here's how a simple circuit `c = a + b * 3` would be laid out:
-
-```
-Row 0: PublicInput gate (for input a)
-Row 1: PublicInput gate (for input b)
-Row 2: ArithmeticGate (b * 3)
-Row 3: ArithmeticGate (a + result_from_row2)
-Row 4-N: Padding to reach power-of-2 degree
-
-Copy constraints:
-- a (row 0, wire 0) = a (row 3, wire 0)
-- b (row 1, wire 0) = b (row 2, wire 0)
-- result (row 2, wire 2) = operand (row 3, wire 1)
-```
-
-The witness generation would:
-1. User provides `a` and `b` in PartialWitness
-2. Generators compute intermediate values
-3. PartitionWitness ensures copy constraints are satisfied
-4. MatrixWitness provides final polynomial values
-
-See [plonky2/examples/](plonky2/examples/) for complete working examples.
-
----
-
-**Last Updated**: 2025-11-04
-**Plonky2 Version**: Based on plonky2-lighter repository
diff --git a/field/perm_comp.md b/perm_comp.md
similarity index 100%
rename from field/perm_comp.md
rename to perm_comp.md
diff --git a/task.md b/task.md
deleted file mode 100644
index 11fb7869c..000000000
--- a/task.md
+++ /dev/null
@@ -1 +0,0 @@
-i need to do bench
\ No newline at end of file

From d48ec0de47bf9fb473e793b62ce8441e85d2ceb1 Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Mon, 24 Nov 2025 09:21:13 -0500
Subject: [PATCH 09/37] perm_comp.md: show "-" as speedup for non-accelerated steps

---
 perm_comp.md | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/perm_comp.md b/perm_comp.md
index dec02f91f..ec1886a01 100644
--- a/perm_comp.md
+++ b/perm_comp.md
@@ -5,30 +5,30 @@
 
 | Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? |
 |-----------|---------|---------|---------|------------|
-| **Run generators** | 1.7767 | 1.7899 | 0.99x | ✗ Not accelerated |
-| **Compute full witness** | 0.3369 | 0.3362 | 1.00x | ✗ Not accelerated |
-| **Compute wire polynomials** | 0.0396 | 0.0392 | 1.01x | ✗ Not accelerated |
+| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated |
+| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated |
+| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated |
 | **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes |
 | └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** |
 | └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 2.8010 | 2.7881 | 1.00x | ✗ Not accelerated |
+| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated |
 | └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned |
-| **Compute partial products** | 0.1700 | 0.1671 | 1.02x | ✗ Not accelerated |
+| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated |
 | **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes |
 | └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** |
 | └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 0.3906 | 0.3874 | 1.01x | ✗ Not accelerated |
+| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated |
 | └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned |
-| **Compute quotient polys** | 1.4041 | 1.3128 | 1.07x | ✗ Not accelerated |
-| **Split quotient polys** | 0.0098 | 0.0212 | 0.46x | ✗ Not accelerated|
+| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated |
+| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated|
 | **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes |
 | └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 0.2952 | 0.2908 | 1.02x | ✗ Not accelerated |
+| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated |
 | └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned |
-| **Construct opening set** | 0.1609 | 0.1600 | 1.01x | ✗ Not accelerated |
-| **Compute opening proofs** | 1.3580 | 1.2919 | 1.05x | ✗ Not accelerated |
-| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | 1.02x | ✗ Not accelerated |
-| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | 1.02x | ✗ Not accelerated |
-| └─ Final FFT 4194304 | 0.3083 | 0.3023 | 1.02x | ✗ Not accelerated |
-| └─ Fold codewords | 0.1312 | 0.0904 | **1.45x** | ✗ Not accelerated |
-| └─ Find PoW witness | 0.0014 | 0.0038 | 0.37x | ✗ Not accelerated |
\ No newline at end of file
+| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated |
+| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated |
+| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated |
+| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated |
+| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated |
+| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated |
+| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated |
\ No newline at end of file

From 6139c76e56d97eeaff478490d3ce370902939aad Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Mon, 24 Nov 2025 09:23:57 -0500
Subject: [PATCH 10/37] perm_comp.md: add circuit size and total CPU/GPU times

---
 perm_comp.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/perm_comp.md b/perm_comp.md
index ec1886a01..333134394 100644
--- a/perm_comp.md
+++ b/perm_comp.md
@@ -1,7 +1,9 @@
 # Performance comparison
-- CPU: AMD 7950x3d 16 core
-- GPU: 4080 super; single card
-- 
+- CPU: AMD 7950x3d; 16 core
+- GPU: NVidia 4080; single card
+- Circuit size: 2^19 gates
+- Total CPU time: **32.97 s**
+- Total GPU time: **19.71 s**
 
 | Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? |
 |-----------|---------|---------|---------|------------|

From 2669e9be44f40cf0af4eeb0c251c94b10b6318c9 Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Tue, 9 Dec 2025 13:33:34 -0500
Subject: [PATCH 11/37] Remove merge conflict markers in merkle_tree.rs; reformat

---
 field/src/fft.rs                |  2 +-
 plonky2/src/hash/merkle_tree.rs | 40 ++++++++++-----------------------
 2 files changed, 13 insertions(+), 29 deletions(-)

diff --git a/field/src/fft.rs b/field/src/fft.rs
index bccfb3486..682a1e33d 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -678,7 +678,7 @@ mod tests {
         type F = GoldilocksField;
 
         // Test various polynomial sizes
-        for log_size in [8, 10, 12, 14,16,18,20] {
+        for log_size in [8, 10, 12, 14, 16, 18, 20] {
             let size = 1 << log_size;
             zeknox::clear_cuda_errors_rs();
             init_twiddle_factors_rs(0, log_size);
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 8865cf1c2..109eedeb0 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -15,21 +15,11 @@ use once_cell::sync::Lazy;
 use plonky2_maybe_rayon::*;
 use serde::{Deserialize, Serialize};
 #[cfg(feature = "cuda")]
-<<<<<<< HEAD
-use zeknox::device::memory::HostOrDeviceSlice;
-#[cfg(feature = "cuda")]
-use zeknox::device::stream::CudaStream;
-#[cfg(feature = "cuda")]
-use zeknox::fill_digests_buf_linear_gpu_with_gpu_ptr;
-#[cfg(feature = "cuda")]
-use zeknox::fill_digests_buf_linear_multigpu_with_gpu_ptr;
-=======
 use zeknox::device::{memory::HostOrDeviceSlice, stream::CudaStream};
 #[cfg(feature = "cuda")]
 use zeknox::{
     fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
 };
->>>>>>> zz-lighter/zz/cuda_integration
 
 use crate::hash::hash_types::RichField;
 #[cfg(feature = "cuda")]
@@ -266,12 +256,6 @@ fn fill_digests_buf<F: RichField, H: Hasher<F>>(
 }
 
 #[cfg(feature = "cuda")]
-<<<<<<< HEAD
-#[repr(C)]
-union U8U64 {
-    f1: [u8; 32],
-    f2: [u64; 4],
-=======
 fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
     digests_buf: &mut [MaybeUninit<H::Hash>],
     cap_buf: &mut [MaybeUninit<H::Hash>],
@@ -361,7 +345,6 @@ fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
     stream2.synchronize().expect("cuda sync");
     stream1.destroy().expect("cuda stream destroy");
     stream2.destroy().expect("cuda stream destroy");
->>>>>>> zz-lighter/zz/cuda_integration
 }
 
 #[cfg(feature = "cuda")]
@@ -370,8 +353,6 @@ fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
     cap_buf: &mut [MaybeUninit<H::Hash>],
     leaves: &Vec<F>,
     leaf_size: usize,
-<<<<<<< HEAD
-=======
     cap_height: usize,
 ) {
     let leaves_count = leaves.len() / leaf_size;
@@ -396,7 +377,6 @@ fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
 pub(crate) fn merkle_tree_prove<F: RichField, H: Hasher<F>>(
     leaf_index: usize,
     leaves_len: usize,
->>>>>>> zz-lighter/zz/cuda_integration
     cap_height: usize,
 ) {
     let leaves_count = leaves.len() / leaf_size;
@@ -609,11 +589,6 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
-<<<<<<< HEAD
-        let now = Instant::now();
-        fill_digests_buf_meta::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
-        print_time(now, "fill digests buffer");
-=======
 
         #[cfg(feature = "cuda")]
         {
@@ -631,7 +606,11 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
             if use_gpu {
                 // Flatten leaves into 1D vector for GPU
-                let leaf_size = if leaves.is_empty() { 0 } else { leaves[0].len() };
+                let leaf_size = if leaves.is_empty() {
+                    0
+                } else {
+                    leaves[0].len()
+                };
                 let zeros = vec![F::ZERO; leaf_size];
                 let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves.len() * leaf_size);
                 for leaf in &leaves {
@@ -642,7 +621,13 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
                     }
                 }
 
-                fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
+                fill_digests_buf_gpu::<F, H>(
+                    digests_buf,
+                    cap_buf,
+                    &leaves_1d,
+                    leaf_size,
+                    cap_height,
+                );
             } else {
                 fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
             }
@@ -652,7 +637,6 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
         {
             fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
         }
->>>>>>> zz-lighter/zz/cuda_integration
 
         unsafe {
             // SAFETY: `fill_digests_buf` or `fill_digests_buf_gpu` initialized the spare capacity up to

From 6db10e90208f78e4f28c809276edc9b9b1c16fa4 Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Tue, 9 Dec 2025 13:43:53 -0500
Subject: [PATCH 12/37] fix again

---
 plonky2/src/hash/merkle_tree.rs | 206 ++++----------------------------
 1 file changed, 26 insertions(+), 180 deletions(-)

diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 109eedeb0..dfdf94421 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -1,25 +1,27 @@
+#[cfg(feature = "cuda")]
+use alloc::sync::Arc;
+#[cfg(not(feature = "std"))]
+use alloc::vec::Vec;
 use core::mem::MaybeUninit;
 use core::slice;
 use std::collections::HashSet;
 #[cfg(feature = "cuda")]
-use std::sync::Arc;
-#[cfg(feature = "cuda")]
 use std::sync::Mutex;
 use std::time::Instant;
-#[cfg(not(feature = "std"))]
-use std::vec::Vec;
 
-use num::range;
 #[cfg(feature = "cuda")]
-use once_cell::sync::Lazy;
-use plonky2_maybe_rayon::*;
-use serde::{Deserialize, Serialize};
+use cryptography_cuda::device::memory::HostOrDeviceSlice;
 #[cfg(feature = "cuda")]
-use zeknox::device::{memory::HostOrDeviceSlice, stream::CudaStream};
+use cryptography_cuda::device::stream::CudaStream;
 #[cfg(feature = "cuda")]
-use zeknox::{
+use cryptography_cuda::merkle::bindings::{
     fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
 };
+use num::range;
+#[cfg(feature = "cuda")]
+use once_cell::sync::Lazy;
+use plonky2_maybe_rayon::*;
+use serde::{Deserialize, Serialize};
 
 use crate::hash::hash_types::RichField;
 #[cfg(feature = "cuda")]
@@ -33,14 +35,10 @@ use crate::util::log2_strict;
 #[cfg(feature = "cuda")]
 pub static GPU_ID: Lazy<Arc<Mutex<u64>>> = Lazy::new(|| Arc::new(Mutex::new(0)));
 
-#[cfg(all(feature = "timing", feature = "cuda"))]
 fn print_time(now: Instant, msg: &str) {
     println!("Time {} {} ms", msg, now.elapsed().as_millis());
 }
 
-#[cfg(not(all(feature = "timing", feature = "cuda")))]
-fn print_time(_now: Instant, _msg: &str) {}
-
 #[cfg(feature = "cuda")]
 const FORCE_SINGLE_GPU: bool = true;
 
@@ -256,95 +254,10 @@ fn fill_digests_buf<F: RichField, H: Hasher<F>>(
 }
 
 #[cfg(feature = "cuda")]
-fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
-    digests_buf: &mut [MaybeUninit<H::Hash>],
-    cap_buf: &mut [MaybeUninit<H::Hash>],
-    leaves_ptr: *const F,
-    leaves_len: usize,
-    leaf_len: usize,
-    cap_height: usize,
-    gpu_id: u64,
-) {
-    let digests_count: u64 = digests_buf.len().try_into().unwrap();
-    let leaves_count: u64 = leaves_len.try_into().unwrap();
-    let caps_count: u64 = cap_buf.len().try_into().unwrap();
-    let cap_height: u64 = cap_height.try_into().unwrap();
-    let leaf_size: u64 = leaf_len.try_into().unwrap();
-
-    // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors
-    let digests_size = if digests_buf.len() == 0 {
-        NUM_HASH_OUT_ELTS
-    } else {
-        digests_buf.len() * NUM_HASH_OUT_ELTS
-    };
-    let caps_size = if cap_buf.len() == 0 {
-        NUM_HASH_OUT_ELTS
-    } else {
-        cap_buf.len() * NUM_HASH_OUT_ELTS
-    };
-
-    let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> =
-        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap();
-    let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> =
-        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap();
-
-    unsafe {
-        let num_gpus: usize = std::env::var("NUM_OF_GPUS")
-            .unwrap_or_else(|_| "1".to_string())
-            .parse()
-            .unwrap_or(1);
-
-        if leaves_count >= (1 << 12) && cap_height > 0 && num_gpus > 1 {
-            // Multi-GPU path
-            fill_digests_buf_linear_multigpu_with_gpu_ptr(
-                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
-                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
-                leaves_ptr as *mut core::ffi::c_void,
-                digests_count,
-                caps_count,
-                leaves_count,
-                leaf_size,
-                cap_height,
-                0, // hash_type: 0 for Poseidon
-            );
-        } else {
-            // Single GPU path
-            fill_digests_buf_linear_gpu_with_gpu_ptr(
-                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
-                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
-                leaves_ptr as *mut core::ffi::c_void,
-                digests_count,
-                caps_count,
-                leaves_count,
-                leaf_size,
-                cap_height,
-                0, // hash_type: 0 for Poseidon
-                gpu_id,
-            );
-        }
-    }
-
-    let stream1 = CudaStream::create().unwrap();
-    let stream2 = CudaStream::create().unwrap();
-
-    gpu_digests_buf
-        .copy_to_host_ptr_async(
-            digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
-            digests_size,
-            &stream1,
-        )
-        .expect("copy digests");
-    gpu_cap_buf
-        .copy_to_host_ptr_async(
-            cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
-            caps_size,
-            &stream2,
-        )
-        .expect("copy caps");
-    stream1.synchronize().expect("cuda sync");
-    stream2.synchronize().expect("cuda sync");
-    stream1.destroy().expect("cuda stream destroy");
-    stream2.destroy().expect("cuda stream destroy");
+#[repr(C)]
+union U8U64 {
+    f1: [u8; 32],
+    f2: [u64; 4],
 }
 
 #[cfg(feature = "cuda")]
@@ -356,30 +269,6 @@ fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
     cap_height: usize,
 ) {
     let leaves_count = leaves.len() / leaf_size;
-    let gpu_id = 0;
-
-    let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> =
-        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap();
-
-    let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice());
-
-    fill_digests_buf_gpu_ptr::<F, H>(
-        digests_buf,
-        cap_buf,
-        gpu_leaves_buf.as_mut_ptr(),
-        leaves_count,
-        leaf_size,
-        cap_height,
-        gpu_id,
-    );
-}
-
-pub(crate) fn merkle_tree_prove<F: RichField, H: Hasher<F>>(
-    leaf_index: usize,
-    leaves_len: usize,
-    cap_height: usize,
-) {
-    let leaves_count = leaves.len() / leaf_size;
 
     let num_gpus: usize = std::env::var("NUM_OF_GPUS")
         .expect("NUM_OF_GPUS should be set")
@@ -552,7 +441,7 @@ fn fill_digests_buf_meta<F: RichField, H: Hasher<F>>(
     cap_height: usize,
 ) {
     // if the input is small or if it Keccak hashing, just do the hashing on CPU
-    if leaf_size <= H::HASH_SIZE / 8 {
+    if leaf_size <= H::HASH_SIZE / 8 || H::HASHER_TYPE == HasherType::Keccak {
         fill_digests_buf::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
     } else {
         fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
@@ -589,57 +478,12 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
-
-        #[cfg(feature = "cuda")]
-        {
-            // Check if we should use GPU acceleration
-            // Use GPU for large trees (>= 1024 leaves) or if CUDA_MERKLE_THRESHOLD is set
-            let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_MERKLE_THRESHOLD") {
-                if let Ok(threshold) = threshold_str.parse::<usize>() {
-                    leaves.len() >= threshold
-                } else {
-                    leaves.len() >= 1024
-                }
-            } else {
-                leaves.len() >= 1024
-            };
-
-            if use_gpu {
-                // Flatten leaves into 1D vector for GPU
-                let leaf_size = if leaves.is_empty() {
-                    0
-                } else {
-                    leaves[0].len()
-                };
-                let zeros = vec![F::ZERO; leaf_size];
-                let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves.len() * leaf_size);
-                for leaf in &leaves {
-                    if leaf.is_empty() {
-                        leaves_1d.extend(zeros.clone());
-                    } else {
-                        leaves_1d.extend(leaf.clone());
-                    }
-                }
-
-                fill_digests_buf_gpu::<F, H>(
-                    digests_buf,
-                    cap_buf,
-                    &leaves_1d,
-                    leaf_size,
-                    cap_height,
-                );
-            } else {
-                fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
-            }
-        }
-
-        #[cfg(not(feature = "cuda"))]
-        {
-            fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
-        }
+        let now = Instant::now();
+        fill_digests_buf_meta::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
+        print_time(now, "fill digests buffer");
 
         unsafe {
-            // SAFETY: `fill_digests_buf` or `fill_digests_buf_gpu` initialized the spare capacity up to
+            // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.
             digests.set_len(num_digests);
             cap.set_len(len_cap);
@@ -998,7 +842,9 @@ mod tests {
     use super::*;
     use crate::field::extension::Extendable;
     use crate::hash::merkle_proofs::verify_merkle_proof_to_cap;
-    use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig};
+    use crate::plonk::config::{
+        GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig,
+    };
 
     fn random_data<F: RichField>(n: usize, k: usize) -> Vec<Vec<F>> {
         (0..n).map(|_| F::rand_vec(k)).collect()
@@ -1343,7 +1189,7 @@ mod tests {
 
         Ok(())
     }
-
+    
     #[test]
     fn test_merkle_trees_keccak() -> Result<()> {
         const D: usize = 2;
@@ -1358,4 +1204,4 @@ mod tests {
 
         Ok(())
     }
-}
+}
\ No newline at end of file

From 022397af1cc0c39f97cab780b97cf0cde5515858 Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Tue, 9 Dec 2025 14:05:05 -0500
Subject: [PATCH 13/37] merkle_tree: switch CUDA imports to zeknox; rustfmt

---
 plonky2/src/hash/merkle_tree.rs | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index dfdf94421..461c04754 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -1,27 +1,27 @@
-#[cfg(feature = "cuda")]
-use alloc::sync::Arc;
-#[cfg(not(feature = "std"))]
-use alloc::vec::Vec;
 use core::mem::MaybeUninit;
 use core::slice;
 use std::collections::HashSet;
 #[cfg(feature = "cuda")]
+use std::sync::Arc;
+#[cfg(feature = "cuda")]
 use std::sync::Mutex;
 use std::time::Instant;
+#[cfg(not(feature = "std"))]
+use std::vec::Vec;
 
-#[cfg(feature = "cuda")]
-use cryptography_cuda::device::memory::HostOrDeviceSlice;
-#[cfg(feature = "cuda")]
-use cryptography_cuda::device::stream::CudaStream;
-#[cfg(feature = "cuda")]
-use cryptography_cuda::merkle::bindings::{
-    fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
-};
 use num::range;
 #[cfg(feature = "cuda")]
 use once_cell::sync::Lazy;
 use plonky2_maybe_rayon::*;
 use serde::{Deserialize, Serialize};
+#[cfg(feature = "cuda")]
+use zeknox::device::memory::HostOrDeviceSlice;
+#[cfg(feature = "cuda")]
+use zeknox::device::stream::CudaStream;
+#[cfg(feature = "cuda")]
+use zeknox::merkle::bindings::{
+    fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
+};
 
 use crate::hash::hash_types::RichField;
 #[cfg(feature = "cuda")]
@@ -842,9 +842,7 @@ mod tests {
     use super::*;
     use crate::field::extension::Extendable;
     use crate::hash::merkle_proofs::verify_merkle_proof_to_cap;
-    use crate::plonk::config::{
-        GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig,
-    };
+    use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig};
 
     fn random_data<F: RichField>(n: usize, k: usize) -> Vec<Vec<F>> {
         (0..n).map(|_| F::rand_vec(k)).collect()
@@ -1189,7 +1187,7 @@ mod tests {
 
         Ok(())
     }
-    
+
     #[test]
     fn test_merkle_trees_keccak() -> Result<()> {
         const D: usize = 2;
@@ -1204,4 +1202,4 @@ mod tests {
 
         Ok(())
     }
-}
\ No newline at end of file
+}

From 914139fabdcb186bb612e3e9c0eeb7301f9a485c Mon Sep 17 00:00:00 2001
From: lighter-zz <allaboutshop8@163.com>
Date: Tue, 9 Dec 2025 14:06:18 -0500
Subject: [PATCH 14/37] merkle_tree: import GPU digest fns from zeknox root

---
 plonky2/src/hash/merkle_tree.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 461c04754..5bfbb663a 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -19,7 +19,7 @@ use zeknox::device::memory::HostOrDeviceSlice;
 #[cfg(feature = "cuda")]
 use zeknox::device::stream::CudaStream;
 #[cfg(feature = "cuda")]
-use zeknox::merkle::bindings::{
+use zeknox::{
     fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
 };
 

From 80d429282b21b0cd9a98a5ea0a75fab56a353fc2 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Tue, 9 Dec 2025 21:12:27 +0000
Subject: [PATCH 15/37] merkle tree good version

---
 field/src/fft.rs                | 676 +-------------------------------
 field/src/polynomial/mod.rs     |  73 +---
 plonky2/examples/fibonacci.rs   |  40 +-
 plonky2/src/hash/merkle_tree.rs |  10 +-
 4 files changed, 36 insertions(+), 763 deletions(-)

diff --git a/field/src/fft.rs b/field/src/fft.rs
index 682a1e33d..d078ca6c3 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -32,284 +32,16 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
-#[cfg(feature = "cuda")]
-fn fft_dispatch_gpu<F: Field>(
-    input: &mut [F],
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) {
-    // if F::CUDA_SUPPORT {
-    //     use zeknox::ntt_batch;
-    //     use zeknox::types::NTTConfig;
-
-    //     let mut a = input.to_vec();
-    //     let mut b = input.to_vec();
-
-    //     ntt_batch(
-    //         0,
-    //         a.as_mut_ptr(),
-    //         input.len().trailing_zeros() as usize,
-    //         NTTConfig::default(),
-    //     );
-
-    //     fft_dispatch_cpu(&mut b, zero_factor, root_table);
-    //     ark_std::println!("a: {:?}", a);
-    //     ark_std::println!("b: {:?}", b);
-
-    //     assert_eq!(
-    //         a, b,
-    //         "failed GPU FFT vs CPU FFT comparison\ngpu:{:?}\ncpu:{:?}\ninput:{:?}",
-    //         a, b, input
-    //     );
-
-    //     input.copy_from_slice(&a);
-    // }
-    // return fft_dispatch_cpu(input, zero_factor, root_table);
-
-    use zeknox::ntt_batch;
-    use zeknox::types::NTTConfig;
-    if F::CUDA_SUPPORT {
-        return ntt_batch(
-            0,
-            input.as_mut_ptr(),
-            input.len().trailing_zeros() as usize,
-            NTTConfig::default(),
-        );
-    } else {
-        return fft_dispatch_cpu(input, zero_factor, root_table);
-    }
-}
-
-/// Batch FFT computation for multiple polynomials on GPU
-#[cfg(feature = "cuda")]
-fn fft_batch_dispatch_gpu<F: Field>(
-    inputs: &mut [F],
-    poly_size: usize,
-    num_polys: usize,
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) {
-    use zeknox::ntt_batch;
-    use zeknox::types::NTTConfig;
-
-    if F::CUDA_SUPPORT {
-        let mut cfg = NTTConfig::default();
-        cfg.batches = num_polys as u32;
-
-        return ntt_batch(
-            0,
-            inputs.as_mut_ptr(),
-            poly_size.trailing_zeros() as usize,
-            cfg,
-        );
-    } else {
-        // Fallback to CPU: process each polynomial separately
-        for i in 0..num_polys {
-            let start = i * poly_size;
-            let end = start + poly_size;
-            fft_dispatch_cpu(&mut inputs[start..end], zero_factor, root_table);
-        }
-    }
-}
-
-#[cfg(feature = "cuda")]
-pub(crate) fn coset_fft_gpu<F: Field>(
-    poly: PolynomialCoeffs<F>,
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) -> PolynomialValues<F> {
-    use zeknox::ntt_batch;
-    use zeknox::types::NTTConfig;
-
-    if !F::CUDA_SUPPORT {
-        // Fallback to CPU if CUDA not supported for this field
-        let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-            .powers()
-            .zip(&poly.coeffs)
-            .map(|(r, &c)| r * c)
-            .collect::<Vec<_>>()
-            .into();
-        return fft_with_options(modified_poly, zero_factor, root_table);
-    }
-
-    let PolynomialCoeffs { coeffs: mut buffer } = poly;
-    let lg_n = buffer.len().trailing_zeros() as usize;
-
-    // // Initialize coset on GPU
-    // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
-    // // TODO: Make this generic for other fields if needed
-    // let coset_gen_u64 = 7u64;
-    // init_coset_rs(0, lg_n, coset_gen_u64);
-
-    // Configure NTT for coset
-    let mut cfg = NTTConfig::default();
-    cfg.with_coset = true;
-    cfg.ntt_type = zeknox::types::NTTType::Coset;
-
-    // Perform coset NTT on GPU
-    ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg);
-
-    PolynomialValues::new(buffer)
-}
-
-/// Batch coset FFT computation for multiple polynomials on GPU
-#[cfg(feature = "cuda")]
-fn coset_fft_batch_gpu<F: Field>(
-    polys: Vec<PolynomialCoeffs<F>>,
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) -> Vec<PolynomialValues<F>> {
-    use zeknox::ntt_batch;
-    use zeknox::types::NTTConfig;
-
-    if polys.is_empty() {
-        return Vec::new();
-    }
-
-    let num_polys = polys.len();
-    let poly_size = polys[0].len();
-
-    // Verify all polynomials have the same size
-    assert!(
-        polys.iter().all(|p| p.len() == poly_size),
-        "All polynomials must have the same size for batch coset FFT"
-    );
-
-    if !F::CUDA_SUPPORT {
-        // Fallback to CPU if CUDA not supported for this field
-        return polys
-            .into_iter()
-            .map(|poly| {
-                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-                    .powers()
-                    .zip(&poly.coeffs)
-                    .map(|(r, &c)| r * c)
-                    .collect::<Vec<_>>()
-                    .into();
-                fft_with_options(modified_poly, zero_factor, root_table)
-            })
-            .collect();
-    }
-
-    // Flatten all polynomials into a single contiguous buffer
-    let mut buffer: Vec<F> = Vec::with_capacity(num_polys * poly_size);
-    for poly in polys {
-        buffer.extend_from_slice(&poly.coeffs);
-    }
-
-    let lg_n = poly_size.trailing_zeros() as usize;
-
-    // Configure NTT for batch coset
-    let mut cfg = NTTConfig::default();
-    cfg.batches = num_polys as u32;
-    cfg.with_coset = true;
-    cfg.ntt_type = zeknox::types::NTTType::Coset;
-
-    // Perform batch coset NTT on GPU
-    ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg);
-
-    // Split the buffer back into separate polynomials
-    buffer
-        .chunks(poly_size)
-        .map(|chunk| PolynomialValues::new(chunk.to_vec()))
-        .collect()
-}
-
-/// Compute coset FFT for multiple polynomials in batch.
-/// All polynomials must have the same size (power of 2).
-/// Returns a vector of PolynomialValues in the same order as input.
-pub fn coset_fft_batch<F: Field>(polys: Vec<PolynomialCoeffs<F>>) -> Vec<PolynomialValues<F>> {
-    coset_fft_batch_with_options(polys, None, None)
-}
-
-/// Compute coset FFT for multiple polynomials in batch with options.
-/// All polynomials must have the same size (power of 2).
-/// Returns a vector of PolynomialValues in the same order as input.
-pub fn coset_fft_batch_with_options<F: Field>(
-    polys: Vec<PolynomialCoeffs<F>>,
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) -> Vec<PolynomialValues<F>> {
-    // #[cfg(feature = "cuda")]
-    // {
-    //     let a = coset_fft_batch_gpu(polys.clone(), zero_factor, root_table);
-    //     let b = polys
-    //         .into_iter()
-    //         .map(|poly| {
-    //             let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-    //                 .powers()
-    //                 .zip(&poly.coeffs)
-    //                 .map(|(r, &c)| r * c)
-    //                 .collect::<Vec<_>>()
-    //                 .into();
-    //             fft_with_options(modified_poly, zero_factor, root_table)
-    //         })
-    //         .collect::<Vec<_>>();
-    //     assert_eq!(a.len(), b.len());
-
-    //     for (i, (val_a, val_b)) in a.iter().zip(b.iter()).enumerate() {
-    //         assert_eq!(val_a, val_b, "Mismatch at index {}", i);
-    //     }
-
-    //     return a;
-    // }
-
-    // #[cfg(not(feature = "cuda"))]
-    // {
-    // CPU fallback: process each polynomial separately
-    polys
-        .into_iter()
-        .map(|poly| {
-            let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-                .powers()
-                .zip(&poly.coeffs)
-                .map(|(r, &c)| r * c)
-                .collect::<Vec<_>>()
-                .into();
-            fft_with_options(modified_poly, zero_factor, root_table)
-        })
-        .collect()
-    // }
-}
-
-pub(crate) fn fft_dispatch_cpu<F: Field>(
-    input: &mut [F],
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) {
-    if root_table.is_some() {
-        return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap());
-    } else {
-        // let pre_computed = F::pre_compute_fft_root_table(input.len());
-        // if pre_computed.is_some() {
-        //     return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap());
-        // } else {
-        //     let computed = fft_root_table::<F>(input.len());
-
-        //     return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
-        // }
-        let computed = fft_root_table::<F>(input.len());
-
-        return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
-    };
-}
-
 #[inline]
 fn fft_dispatch<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    #[cfg(feature = "cuda")]
-    {
-        // ark_std::println!("Using GPU FFT dispatch");
-        return fft_dispatch_gpu(input, zero_factor, root_table);
-    }
-    #[cfg(not(feature = "cuda"))]
-    {
-        // ark_std::println!("Using CPU FFT dispatch");
-        return fft_dispatch_cpu(input, zero_factor, root_table);
-    }
+    let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
+    let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();
+
+    fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
 }
 
 #[inline]
@@ -328,66 +60,6 @@ pub fn fft_with_options<F: Field>(
     PolynomialValues::new(buffer)
 }
 
-/// Compute FFT for multiple polynomials in batch.
-/// All polynomials must have the same size (power of 2).
-/// Returns a vector of PolynomialValues in the same order as input.
-#[inline]
-pub fn fft_batch<F: Field>(polys: Vec<PolynomialCoeffs<F>>) -> Vec<PolynomialValues<F>> {
-    fft_batch_with_options(polys, None, None)
-}
-
-/// Compute FFT for multiple polynomials in batch with options.
-/// All polynomials must have the same size (power of 2).
-/// Returns a vector of PolynomialValues in the same order as input.
-pub fn fft_batch_with_options<F: Field>(
-    polys: Vec<PolynomialCoeffs<F>>,
-    zero_factor: Option<usize>,
-    root_table: Option<&FftRootTable<F>>,
-) -> Vec<PolynomialValues<F>> {
-    if polys.is_empty() {
-        return Vec::new();
-    }
-
-    let num_polys = polys.len();
-    let poly_size = polys[0].len();
-
-    // Verify all polynomials have the same size
-    assert!(
-        polys.iter().all(|p| p.len() == poly_size),
-        "All polynomials must have the same size for batch FFT"
-    );
-    assert!(
-        poly_size.is_power_of_two(),
-        "Polynomial size must be a power of 2"
-    );
-
-    // Flatten all polynomials into a single contiguous buffer
-    let mut buffer: Vec<F> = Vec::with_capacity(num_polys * poly_size);
-    for poly in polys {
-        buffer.extend_from_slice(&poly.coeffs);
-    }
-
-    // Dispatch to GPU or CPU batch processing
-    #[cfg(feature = "cuda")]
-    fft_batch_dispatch_gpu(&mut buffer, poly_size, num_polys, zero_factor, root_table);
-
-    #[cfg(not(feature = "cuda"))]
-    {
-        // CPU fallback: process each polynomial separately
-        for i in 0..num_polys {
-            let start = i * poly_size;
-            let end = start + poly_size;
-            fft_dispatch_cpu(&mut buffer[start..end], zero_factor, root_table);
-        }
-    }
-
-    // Split the buffer back into separate polynomials
-    buffer
-        .chunks(poly_size)
-        .map(|chunk| PolynomialValues::new(chunk.to_vec()))
-        .collect()
-}
-
 #[inline]
 pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
     ifft_with_options(poly, None, None)
@@ -534,112 +206,18 @@ mod tests {
     use alloc::vec::Vec;
 
     use plonky2_util::{log2_ceil, log2_strict};
-    #[cfg(feature = "cuda")]
-    use zeknox::init_twiddle_factors_rs;
 
-    #[cfg(feature = "cuda")]
-    use crate::fft::{coset_fft_batch, fft_dispatch_cpu, fft_dispatch_gpu};
-    use crate::fft::{fft, fft_batch, fft_with_options, ifft};
+    use crate::fft::{fft, fft_with_options, ifft};
     use crate::goldilocks_field::GoldilocksField;
     use crate::polynomial::{PolynomialCoeffs, PolynomialValues};
     use crate::types::Field;
 
-    #[test]
-    #[cfg(feature = "cuda")]
-    fn test_kat() {
-        init_twiddle_factors_rs(0, 4);
-
-        let input = [
-            16807u64,
-            10376289027450995739,
-            18446743787439915009,
-            1905022641934172156,
-            4730749933575995392,
-            68841472,
-            18428264577490855681,
-            18445589101169082369,
-            18446744069414567514,
-            8070455041963588582,
-            49,
-            1625527855624486912,
-            7,
-            18446744069414555649,
-            7696581392640,
-            481036337152,
-        ];
-        let input_field: Vec<GoldilocksField> = input
-            .iter()
-            .map(|&x| GoldilocksField::from_canonical_u64(x))
-            .collect();
-
-        let res_cpu = [
-            8241673866677297204,
-            18443207692673526440,
-            3336172192632445894,
-            12915814655533318448,
-            5977358399840934215,
-            2796120128477098295,
-            16099264885043452953,
-            1114428869533774434,
-            1182881845840683068,
-            18442399148451944616,
-            5639697009785877037,
-            5534977815694745617,
-            3521085621945067109,
-            15650623939293352472,
-            11342098386477995483,
-            17336148097415430195,
-        ];
-        let res_cpu_field: Vec<GoldilocksField> = res_cpu
-            .iter()
-            .map(|&x| GoldilocksField::from_canonical_u64(x))
-            .collect();
-
-        let res_gpu = [
-            8241673866677297204,
-            18443207692673526440,
-            3336172192632445894,
-            12915814655533318448,
-            5977358399840934215,
-            2796120128477098295,
-            16099264885043452953,
-            1114428869533774434,
-            1182881845840683068,
-            18442399148451944616,
-            5639697009785877037,
-            5534977815694745617,
-            3521085621945067109,
-            15650623939293352472,
-            11342098386477995483,
-            17336148097415430195,
-        ];
-        let res_gpu_field: Vec<GoldilocksField> = res_gpu
-            .iter()
-            .map(|&x| GoldilocksField::from_canonical_u64(x))
-            .collect();
-
-        let mut input_cpu = input_field.clone();
-        fft_dispatch_cpu(&mut input_cpu, None, None);
-        assert_eq!(input_cpu, res_cpu_field);
-
-        let mut input_gpu = input_field.clone();
-        fft_dispatch_gpu(&mut input_gpu, None, None);
-        assert_eq!(input_gpu, res_gpu_field);
-    }
-
     #[test]
     fn fft_and_ifft() {
         type F = GoldilocksField;
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
 
-        #[cfg(feature = "cuda")]
-        let log_degree = {
-            zeknox::clear_cuda_errors_rs();
-            let log_degree = degree_padded.trailing_zeros() as usize;
-            init_twiddle_factors_rs(0, log_degree);
-            log_degree
-        };
         // Create a vector of coeffs; the first degree of them are
         // "random", the last degree_padded-degree of them are zero.
         let coeffs = (0..degree)
@@ -661,8 +239,6 @@ mod tests {
         }
 
         for r in 0..4 {
-            #[cfg(feature = "cuda")]
-            init_twiddle_factors_rs(0, log_degree + r);
             // expand coefficients by factor 2^r by filling with zeros
             let zero_tail = coefficients.lde(r);
             assert_eq!(
@@ -672,248 +248,6 @@ mod tests {
         }
     }
 
-    #[test]
-    #[cfg(feature = "cuda")]
-    fn test_fft_gpu_vs_cpu_single() {
-        type F = GoldilocksField;
-
-        // Test various polynomial sizes
-        for log_size in [8, 10, 12, 14, 16, 18, 20] {
-            let size = 1 << log_size;
-            zeknox::clear_cuda_errors_rs();
-            init_twiddle_factors_rs(0, log_size);
-
-            // Create a random polynomial
-            let coeffs: Vec<F> = (0..size)
-                .map(|i| F::from_canonical_usize(i * 7919 % 1000000))
-                .collect();
-
-            let poly = PolynomialCoeffs {
-                coeffs: coeffs.clone(),
-            };
-
-            // Compute FFT using GPU (via fft function which dispatches to GPU)
-            let gpu_result = fft(poly.clone());
-
-            // Compute FFT using CPU (force CPU path)
-            let mut cpu_buffer = coeffs.clone();
-            super::fft_dispatch_cpu(&mut cpu_buffer, None, None);
-            let cpu_result = PolynomialValues::new(cpu_buffer);
-
-            // Compare results
-            assert_eq!(
-                gpu_result.len(),
-                cpu_result.len(),
-                "GPU and CPU results have different lengths for size {}",
-                size
-            );
-
-            for i in 0..size {
-                assert_eq!(
-                    gpu_result.values[i], cpu_result.values[i],
-                    "Mismatch at index {} for polynomial size {}",
-                    i, size
-                );
-            }
-        }
-    }
-
-    #[test]
-    #[cfg(feature = "cuda")]
-    fn test_fft_batch_gpu_vs_cpu() {
-        type F = GoldilocksField;
-
-        let poly_size: usize = 1 << 10; // 1024 elements
-        let num_polys = 8;
-        let log_size = poly_size.trailing_zeros() as usize;
-
-        zeknox::clear_cuda_errors_rs();
-        init_twiddle_factors_rs(0, log_size);
-
-        // Create multiple random polynomials
-        let polys: Vec<PolynomialCoeffs<F>> = (0..num_polys)
-            .map(|batch_idx| {
-                let coeffs: Vec<F> = (0..poly_size)
-                    .map(|i| F::from_canonical_usize((i * 7919 + batch_idx * 12345) % 1000000))
-                    .collect();
-                PolynomialCoeffs { coeffs }
-            })
-            .collect();
-
-        // Compute batch FFT using GPU
-        let gpu_results = fft_batch(polys.clone());
-
-        // Compute FFT for each polynomial using CPU
-        let cpu_results: Vec<PolynomialValues<F>> = polys
-            .into_iter()
-            .map(|poly| {
-                let mut buffer = poly.coeffs.clone();
-                super::fft_dispatch_cpu(&mut buffer, None, None);
-                PolynomialValues::new(buffer)
-            })
-            .collect();
-
-        // Compare results
-        assert_eq!(gpu_results.len(), cpu_results.len());
-        for (batch_idx, (gpu_result, cpu_result)) in
-            gpu_results.iter().zip(cpu_results.iter()).enumerate()
-        {
-            assert_eq!(gpu_result.len(), cpu_result.len());
-            for i in 0..poly_size {
-                assert_eq!(
-                    gpu_result.values[i], cpu_result.values[i],
-                    "Batch FFT mismatch at batch {} index {}",
-                    batch_idx, i
-                );
-            }
-        }
-    }
-
-    #[test]
-    #[cfg(feature = "cuda")]
-    fn test_coset_fft_gpu_vs_cpu_single() {
-        use zeknox::init_coset_rs;
-
-        use crate::types::PrimeField64;
-        type F = GoldilocksField;
-
-        for log_size in [8, 10, 12] {
-            let size = 1 << log_size;
-            zeknox::clear_cuda_errors_rs();
-            init_twiddle_factors_rs(0, log_size);
-
-            // Initialize coset for GPU
-            let coset_gen_u64 = F::coset_shift().to_canonical_u64();
-            init_coset_rs(0, log_size, coset_gen_u64);
-
-            // Create a random polynomial
-            let coeffs: Vec<F> = (0..size)
-                .map(|i| F::from_canonical_usize(i * 8191 % 1000000))
-                .collect();
-
-            let poly = PolynomialCoeffs {
-                coeffs: coeffs.clone(),
-            };
-
-            // Compute coset FFT using GPU
-            let gpu_result = super::coset_fft_gpu(poly.clone(), None, None);
-
-            // Compute coset FFT using CPU (apply coset shift then FFT)
-            let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-                .powers()
-                .zip(&coeffs)
-                .map(|(r, &c)| r * c)
-                .collect::<Vec<_>>()
-                .into();
-
-            let mut cpu_buffer = modified_poly.coeffs;
-            super::fft_dispatch_cpu(&mut cpu_buffer, None, None);
-            let cpu_result = PolynomialValues::new(cpu_buffer);
-
-            // Compare results
-            assert_eq!(
-                gpu_result.len(),
-                cpu_result.len(),
-                "GPU and CPU coset FFT results have different lengths for size {}",
-                size
-            );
-
-            for i in 0..size {
-                assert_eq!(
-                    gpu_result.values[i], cpu_result.values[i],
-                    "Coset FFT mismatch at index {} for polynomial size {}",
-                    i, size
-                );
-            }
-        }
-    }
-
-    #[test]
-    #[cfg(feature = "cuda")]
-    fn test_coset_fft_batch_gpu_vs_cpu() {
-        use zeknox::init_coset_rs;
-
-        use crate::types::PrimeField64;
-        type F = GoldilocksField;
-
-        let poly_size: usize = 1 << 10; // 1024 elements
-        let num_polys = 8;
-        let log_size = poly_size.trailing_zeros() as usize;
-
-        zeknox::clear_cuda_errors_rs();
-        init_twiddle_factors_rs(0, log_size);
-
-        // Initialize coset for GPU
-        let coset_gen_u64 = F::coset_shift().to_canonical_u64();
-        init_coset_rs(0, log_size, coset_gen_u64);
-
-        // Create multiple random polynomials
-        let polys: Vec<PolynomialCoeffs<F>> = (0..num_polys)
-            .map(|batch_idx| {
-                let coeffs: Vec<F> = (0..poly_size)
-                    .map(|i| F::from_canonical_usize((i * 8191 + batch_idx * 54321) % 1000000))
-                    .collect();
-                PolynomialCoeffs { coeffs }
-            })
-            .collect();
-
-        // Compute batch coset FFT using GPU
-        let gpu_results = coset_fft_batch(polys.clone());
-
-        // Compute coset FFT for each polynomial using CPU
-        let cpu_results: Vec<PolynomialValues<F>> = polys
-            .into_iter()
-            .map(|poly| {
-                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
-                    .powers()
-                    .zip(&poly.coeffs)
-                    .map(|(r, &c)| r * c)
-                    .collect::<Vec<_>>()
-                    .into();
-
-                let mut buffer = modified_poly.coeffs;
-                super::fft_dispatch_cpu(&mut buffer, None, None);
-                PolynomialValues::new(buffer)
-            })
-            .collect();
-
-        // Compare results
-        assert_eq!(gpu_results.len(), cpu_results.len());
-        for (batch_idx, (gpu_result, cpu_result)) in
-            gpu_results.iter().zip(cpu_results.iter()).enumerate()
-        {
-            assert_eq!(gpu_result.len(), cpu_result.len());
-            for i in 0..poly_size {
-                assert_eq!(
-                    gpu_result.values[i], cpu_result.values[i],
-                    "Batch coset FFT mismatch at batch {} index {}",
-                    batch_idx, i
-                );
-            }
-        }
-    }
-
-    #[test]
-    fn test_batch_fft_empty() {
-        type F = GoldilocksField;
-        let polys: Vec<PolynomialCoeffs<F>> = vec![];
-        let results = fft_batch(polys);
-        assert!(results.is_empty());
-    }
-
-    #[test]
-    #[should_panic(expected = "All polynomials must have the same size")]
-    fn test_batch_fft_different_sizes() {
-        type F = GoldilocksField;
-        let poly1 = PolynomialCoeffs {
-            coeffs: vec![F::ONE; 256],
-        };
-        let poly2 = PolynomialCoeffs {
-            coeffs: vec![F::ONE; 512],
-        };
-        let _ = fft_batch(vec![poly1, poly2]);
-    }
-
     fn evaluate_naive<F: Field>(coefficients: &PolynomialCoeffs<F>) -> PolynomialValues<F> {
         let degree = coefficients.len();
         let degree_padded = 1 << log2_ceil(degree);
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index a78cc10d1..c13bbca27 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -12,7 +12,7 @@ use plonky2_util::log2_strict;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension};
-use crate::fft::{fft, fft_dispatch_cpu, fft_with_options, ifft, FftRootTable};
+use crate::fft::{fft, fft_with_options, ifft, FftRootTable};
 use crate::types::Field;
 
 /// A polynomial in point-value form.
@@ -283,26 +283,13 @@ impl<F: Field> PolynomialCoeffs<F> {
         zero_factor: Option<usize>,
         root_table: Option<&FftRootTable<F>>,
     ) -> PolynomialValues<F> {
-        #[cfg(feature = "cuda")]
-        {
-            if F::CUDA_SUPPORT && shift == F::coset_shift() {
-                // Use GPU coset FFT directly without CPU-side coefficient modification
-                // ark_std::println!("Using GPU coset FFT: degree {}", self.len() - 1);
-                return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
-            }
-        }
-
-        // CPU path: multiply by powers of shift, then do regular FFT
-        let mut modified_poly: Self = shift
+        let modified_poly: Self = shift
             .powers()
             .zip(&self.coeffs)
             .map(|(r, &c)| r * c)
             .collect::<Vec<_>>()
             .into();
-
-        fft_dispatch_cpu(&mut modified_poly.coeffs, zero_factor, root_table);
-        modified_poly.coeffs.into()
-        // modified_poly.fft_with_options(zero_factor, root_table)
+        modified_poly.fft_with_options(zero_factor, root_table)
     }
 
     pub fn to_extension<const D: usize>(&self) -> PolynomialCoeffs<F::Extension>
@@ -453,8 +440,6 @@ impl<F: Field> Mul for &PolynomialCoeffs<F> {
 mod tests {
     use std::time::Instant;
 
-    #[cfg(feature = "cuda")]
-    use plonky2_util::log2_ceil;
     use rand::rngs::OsRng;
     use rand::Rng;
 
@@ -494,13 +479,6 @@ mod tests {
 
         let k = 8;
         let n = 1 << k;
-
-        #[cfg(feature = "cuda")]
-        {
-            zeknox::clear_cuda_errors_rs();
-            zeknox::init_twiddle_factors_rs(0, k);
-        }
-
         let poly = PolynomialCoeffs::new(F::rand_vec(n));
         let shift = F::rand();
         let coset_evals = poly.coset_fft(shift).values;
@@ -522,13 +500,6 @@ mod tests {
 
         let k = 8;
         let n = 1 << k;
-
-        #[cfg(feature = "cuda")]
-        {
-            zeknox::clear_cuda_errors_rs();
-            zeknox::init_twiddle_factors_rs(0, k);
-        }
-
         let evals = PolynomialValues::new(F::rand_vec(n));
         let shift = F::rand();
         let coeffs = evals.clone().coset_ifft(shift);
@@ -549,12 +520,6 @@ mod tests {
         type F = GoldilocksField;
         let mut rng = OsRng;
         let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000));
-
-        #[cfg(feature = "cuda")]
-        {
-            zeknox::clear_cuda_errors_rs();
-            zeknox::init_twiddle_factors_rs(0, log2_ceil(a_deg + b_deg + 1));
-        }
         let a = PolynomialCoeffs::new(F::rand_vec(a_deg));
         let b = PolynomialCoeffs::new(F::rand_vec(b_deg));
         let m1 = &a * &b;
@@ -572,24 +537,11 @@ mod tests {
         let mut rng = OsRng;
         let a_deg = rng.gen_range(0..1_000);
         let n = rng.gen_range(1..1_000);
-
-        #[cfg(feature = "cuda")]
-        {
-            zeknox::clear_cuda_errors_rs();
-            for i in 1..=log2_ceil(max(a_deg, n)) + 1 {
-                zeknox::init_twiddle_factors_rs(0, i);
-            }
-        }
-
         let mut a = PolynomialCoeffs::new(F::rand_vec(a_deg + 1));
-        println!("a {} b {}", a.len(), n);
-
         if a.coeffs[0].is_zero() {
             a.coeffs[0] = F::ONE; // First coefficient needs to be nonzero.
         }
         let b = a.inv_mod_xn(n);
-        println!("a {} b {}", a.len(), b.len());
-
         let mut m = &a * &b;
         m.coeffs.truncate(n);
         m.trim();
@@ -623,15 +575,6 @@ mod tests {
         type F = GoldilocksField;
         let mut rng = OsRng;
         let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000));
-
-        #[cfg(feature = "cuda")]
-        {
-            zeknox::clear_cuda_errors_rs();
-            for i in 1..=log2_ceil(max(a_deg, b_deg)) + 1 {
-                zeknox::init_twiddle_factors_rs(0, i);
-            }
-        }
-
         let a = PolynomialCoeffs::new(F::rand_vec(a_deg));
         let b = PolynomialCoeffs::new(F::rand_vec(b_deg));
         let (q, r) = a.div_rem(&b);
@@ -663,7 +606,6 @@ mod tests {
         let mut rng = OsRng;
         let l = 14;
         let n = 1 << l;
-
         let g = F::primitive_root_of_unity(l);
         let xn_minus_one = {
             let mut xn_min_one_vec = vec![F::ZERO; n + 1];
@@ -674,15 +616,6 @@ mod tests {
 
         let a = g.exp_u64(rng.gen_range(0..(n as u64)));
         let denom = PolynomialCoeffs::new(vec![-a, F::ONE]);
-
-        #[cfg(feature = "cuda")]
-        {
-            zeknox::clear_cuda_errors_rs();
-            for i in 1..=l + 1 {
-                zeknox::init_twiddle_factors_rs(0, i);
-            }
-        }
-
         let now = Instant::now();
         xn_minus_one.div_rem(&denom);
         println!("Division time: {:?}", now.elapsed());
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index 2d2460b92..9573048a6 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -29,33 +29,33 @@ fn main() -> Result<()> {
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..9999999 {
+    for _ in 0..2999999 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
     }
     println!("Circuit built.");
 
-    #[cfg(feature = "cuda")]
-    {
-        zeknox::clear_cuda_errors_rs();
-        println!("Initializing CUDA twiddle factors...");
-        // Initialize twiddle factors for all dimensions that will be used
-        // This test involves multiple polynomials and recursive verification,
-        // so we initialize a wider range of dimensions to be safe
-        // for i in 0..=19 {
-        //     zeknox::init_twiddle_factors_rs(0, i);
-        // }
+    // #[cfg(feature = "cuda")]
+    // {
+    //     zeknox::clear_cuda_errors_rs();
+    //     println!("Initializing CUDA twiddle factors...");
+    //     // Initialize twiddle factors for all dimensions that will be used
+    //     // This test involves multiple polynomials and recursive verification,
+    //     // so we initialize a wider range of dimensions to be safe
+    //     // for i in 0..=19 {
+    //     //     zeknox::init_twiddle_factors_rs(0, i);
+    //     // }
 
-        zeknox::init_twiddle_factors_rs(0, 19);
-        zeknox::init_twiddle_factors_rs(0, 22);
-        // Initialize coset on GPU
-        // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
-        // TODO: Make this generic for other fields if needed
-        let coset_gen_u64 = 7u64;
-        zeknox::init_coset_rs(0, 22, coset_gen_u64);
-        // zeknox::init_coset_rs(0, 16, coset_gen_u64);
-    }
+    //     zeknox::init_twiddle_factors_rs(0, 19);
+    //     zeknox::init_twiddle_factors_rs(0, 22);
+    //     // Initialize coset on GPU
+    //     // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
+    //     // TODO: Make this generic for other fields if needed
+    //     let coset_gen_u64 = 7u64;
+    //     zeknox::init_coset_rs(0, 22, coset_gen_u64);
+    //     // zeknox::init_coset_rs(0, 16, coset_gen_u64);
+    // }
 
     // Public inputs are the two initial values (provided below) and the result (which is generated).
     builder.register_public_input(initial_a);
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 5bfbb663a..3561471bd 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -281,10 +281,16 @@ fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
     if *gpu_id_lock >= num_gpus as u64 {
         *gpu_id_lock = 0;
     }
+    println!("Using GPU id {} leave length {}", gpu_id, leaves.len());
 
     let now = Instant::now();
-    let mut gpu_leaves_buf: HostOrDeviceSlice<'_, F> =
-        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len()).unwrap();
+    let gpu_leaves_buf_result = HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len());
+
+    if gpu_leaves_buf_result.is_err() {
+        panic!("CUDA malloc failed, falling back to CPU for Merkle tree generation");
+    }
+
+    let mut gpu_leaves_buf = gpu_leaves_buf_result.unwrap();
     print_time(now, "alloc gpu leaves buffer");
 
     let now = Instant::now();

From af979c0a9383b07a44c5ee7f71a948b3d24dec50 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 18:43:33 +0000
Subject: [PATCH 16/37] working

---
 field/src/extension/quadratic.rs     |   8 ++
 field/src/extension/quartic.rs       |   8 ++
 field/src/extension/quintic.rs       |   8 ++
 field/src/fft.rs                     | 137 ++++++++++++++++++++++++++-
 field/src/goldilocks_field.rs        |   7 ++
 field/src/polynomial/mod.rs          |   6 +-
 field/src/secp256k1_base.rs          |   7 ++
 field/src/secp256k1_scalar.rs        |   7 ++
 field/src/types.rs                   |   3 +
 plonky2/examples/fibonacci.rs        |  64 ++++++++-----
 plonky2/src/fri/oracle.rs            |   3 +-
 plonky2/src/plonk/circuit_builder.rs |   2 +
 12 files changed, 230 insertions(+), 30 deletions(-)

diff --git a/field/src/extension/quadratic.rs b/field/src/extension/quadratic.rs
index 281369d21..ca74747b9 100644
--- a/field/src/extension/quadratic.rs
+++ b/field/src/extension/quadratic.rs
@@ -59,6 +59,14 @@ impl<F: Extendable<2>> Sample for QuadraticExtension<F> {
 }
 
 impl<F: Extendable<2>> Field for QuadraticExtension<F> {
+    fn to_u64(&self) -> u64 {
+        unimplemented!()
+    }
+
+    fn from_u64(u: u64) -> Self {
+        unimplemented!()
+    }
+
     const ZERO: Self = Self([F::ZERO; 2]);
     const ONE: Self = Self([F::ONE, F::ZERO]);
     const TWO: Self = Self([F::TWO, F::ZERO]);
diff --git a/field/src/extension/quartic.rs b/field/src/extension/quartic.rs
index 8c8a9e7e4..daa9d3aaf 100644
--- a/field/src/extension/quartic.rs
+++ b/field/src/extension/quartic.rs
@@ -65,6 +65,14 @@ impl<F: Extendable<4>> Sample for QuarticExtension<F> {
 }
 
 impl<F: Extendable<4>> Field for QuarticExtension<F> {
+    fn to_u64(&self) -> u64 {
+        unimplemented!()
+    }
+
+    fn from_u64(u: u64) -> Self {
+        unimplemented!()
+    }
+
     const ZERO: Self = Self([F::ZERO; 4]);
     const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO]);
     const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO]);
diff --git a/field/src/extension/quintic.rs b/field/src/extension/quintic.rs
index 28ec92267..21817c6c8 100644
--- a/field/src/extension/quintic.rs
+++ b/field/src/extension/quintic.rs
@@ -66,6 +66,14 @@ impl<F: Extendable<5>> Sample for QuinticExtension<F> {
 }
 
 impl<F: Extendable<5>> Field for QuinticExtension<F> {
+    fn to_u64(&self) -> u64 {
+        unimplemented!()
+    }
+
+    fn from_u64(u: u64) -> Self {
+        unimplemented!()
+    }
+
     const ZERO: Self = Self([F::ZERO; 5]);
     const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO, F::ZERO]);
     const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO, F::ZERO]);
diff --git a/field/src/fft.rs b/field/src/fft.rs
index d078ca6c3..666c4d697 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -32,16 +32,111 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
+#[allow(dead_code)]
+#[cfg(feature = "cuda")]
+fn fft_dispatch_gpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if F::CUDA_SUPPORT {
+        use zeknox::device::memory::HostOrDeviceSlice;
+        use zeknox::ntt_batch;
+        use zeknox::types::NTTConfig;
+
+        // let mut input_clone = input.to_vec();
+        // fft_dispatch_cpu(&mut input_clone, zero_factor, root_table);
+        // ark_std::println!("cpu done" );
+
+        let total_elements = input.len();
+        let mut io_u64 = input.iter().map(|x| x.to_u64()).collect::<Vec<u64>>();
+
+        let mut device_data: HostOrDeviceSlice<'_, u64> =
+            HostOrDeviceSlice::cuda_malloc(0, total_elements).unwrap();
+        device_data
+            .copy_from_host_offset(&io_u64, 0, total_elements)
+            .unwrap();
+        ntt_batch(
+            0,
+            device_data.as_mut_ptr() as *mut F,
+            input.len().trailing_zeros() as usize,
+            NTTConfig::default(),
+        );
+
+        // Copy results back from device to host
+        io_u64.resize(total_elements, 0u64);
+        device_data
+            .copy_to_host(&mut io_u64, total_elements)
+            .unwrap();
+
+        // Convert u64 results back to field elements
+        input.iter_mut().zip(io_u64.iter()).for_each(|(a, b)| {
+            *a = F::from_canonical_u64(*b);
+        });
+        // ark_std::println!("gpu done" );
+
+        // let mut to_print = false;
+        // for (i, (a, b)) in input.iter().zip(input_clone.iter()).enumerate() {
+        //     if a != b {
+        //         // panic!("Mismatch at index {}: gpu result = {}, cpu result = {}", i, a.to_u64(), b.to_u64());
+        //         to_print = true;
+        //         ark_std::println!(
+        //             "Mismatch at index {}: gpu result = {}, cpu result = {}",
+        //             i,
+        //             a.to_u64(),
+        //             b.to_u64()
+        //         );
+        //     }
+        // }
+
+        // if to_print {
+        //     ark_std::println!("Comparing results...");
+        //     ark_std::println!("cpu {:?}", input_clone);
+        //     ark_std::println!("gpu {:?}", input);
+        // }
+
+        return;
+    } else {
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
+}
+
+fn fft_dispatch_cpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if root_table.is_some() {
+        return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap());
+    } else {
+        // let pre_computed = F::pre_compute_fft_root_table(input.len());
+        // if pre_computed.is_some() {
+        //     return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap());
+        // } else {
+        //     let computed = fft_root_table::<F>(input.len());
+
+        //     return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
+        // }
+        let computed = fft_root_table::<F>(input.len());
+
+        return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
+    };
+}
+
 #[inline]
 fn fft_dispatch<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
-    let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();
-
-    fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
+    #[cfg(feature = "cuda")]
+    {
+        return fft_dispatch_gpu(input, zero_factor, root_table);
+    }
+    #[cfg(not(feature = "cuda"))]
+    {
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
 }
 
 #[inline]
@@ -50,6 +145,7 @@ pub fn fft<F: Field>(poly: PolynomialCoeffs<F>) -> PolynomialValues<F> {
 }
 
 #[inline]
+
 pub fn fft_with_options<F: Field>(
     poly: PolynomialCoeffs<F>,
     zero_factor: Option<usize>,
@@ -65,6 +161,28 @@ pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
     ifft_with_options(poly, None, None)
 }
 
+#[inline]
+pub fn ifft_cpu<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
+    let n = poly.len();
+    let lg_n = log2_strict(n);
+    let n_inv = F::inverse_2exp(lg_n);
+
+    let PolynomialValues { values: mut buffer } = poly;
+    fft_dispatch_cpu(&mut buffer, None, None);
+
+    // We reverse all values except the first, and divide each by n.
+    buffer[0] *= n_inv;
+    buffer[n / 2] *= n_inv;
+    for i in 1..(n / 2) {
+        let j = n - i;
+        let coeffs_i = buffer[j] * n_inv;
+        let coeffs_j = buffer[i] * n_inv;
+        buffer[i] = coeffs_i;
+        buffer[j] = coeffs_j;
+    }
+    PolynomialCoeffs { coeffs: buffer }
+}
+
 pub fn ifft_with_options<F: Field>(
     poly: PolynomialValues<F>,
     zero_factor: Option<usize>,
@@ -217,12 +335,20 @@ mod tests {
         type F = GoldilocksField;
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
+        println!("Initializing CUDA");
+
+        #[cfg(feature = "cuda")]
+        for i in 8..=12 {
+            zeknox::init_twiddle_factors_rs(0, i);
+        }
+
+        println!("Testing fft/ifft with degree {}", degree);
 
         // Create a vector of coeffs; the first degree of them are
         // "random", the last degree_padded-degree of them are zero.
         let coeffs = (0..degree)
             .map(|i| F::from_canonical_usize(i * 1337 % 100))
-            .chain(core::iter::repeat_n(F::ZERO, degree_padded - degree))
+            .chain(core::iter::repeat(F::ZERO).take(degree_padded - degree))
             .collect::<Vec<_>>();
         assert_eq!(coeffs.len(), degree_padded);
         let coefficients = PolynomialCoeffs { coeffs };
@@ -238,6 +364,7 @@ mod tests {
             assert_eq!(interpolated_coefficients.coeffs[i], F::ZERO);
         }
 
+        println!("Testing ldes");
         for r in 0..4 {
             // expand coefficients by factor 2^r by filling with zeros
             let zero_tail = coefficients.lde(r);
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index ae8457744..2ac66d4ea 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -68,6 +68,13 @@ impl Sample for GoldilocksField {
 }
 
 impl Field for GoldilocksField {
+    fn to_u64(&self) -> u64 {
+        self.0
+    }
+    fn from_u64(u: u64) -> Self {
+        Self(u)
+    }
+
     const ZERO: Self = Self(0);
     const ONE: Self = Self(1);
     const TWO: Self = Self(2);
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index c13bbca27..21eae220c 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -12,7 +12,7 @@ use plonky2_util::log2_strict;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension};
-use crate::fft::{fft, fft_with_options, ifft, FftRootTable};
+use crate::fft::{FftRootTable, fft, fft_with_options, ifft, ifft_cpu};
 use crate::types::Field;
 
 /// A polynomial in point-value form.
@@ -59,6 +59,10 @@ impl<F: Field> PolynomialValues<F> {
         ifft(self)
     }
 
+    pub fn ifft_cpu(self) -> PolynomialCoeffs<F> {
+        ifft_cpu(self)
+    }
+
     /// Returns the polynomial whose evaluation on the coset `shift*H` is `self`.
     pub fn coset_ifft(self, shift: F) -> PolynomialCoeffs<F> {
         let mut shifted_coeffs = self.ifft();
diff --git a/field/src/secp256k1_base.rs b/field/src/secp256k1_base.rs
index 6632a7f83..20bc7a395 100644
--- a/field/src/secp256k1_base.rs
+++ b/field/src/secp256k1_base.rs
@@ -98,6 +98,13 @@ impl Field for Secp256K1Base {
 
     const BITS: usize = 256;
 
+    fn to_u64(&self) -> u64 {
+        unimplemented!()
+    }
+    fn from_u64(u: u64) -> Self {
+        unimplemented!()
+    }
+
     fn order() -> BigUint {
         BigUint::from_slice(&[
             0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs
index 3ca5b6ba2..3514188e9 100644
--- a/field/src/secp256k1_scalar.rs
+++ b/field/src/secp256k1_scalar.rs
@@ -79,6 +79,13 @@ impl Sample for Secp256K1Scalar {
 }
 
 impl Field for Secp256K1Scalar {
+    fn to_u64(&self) -> u64 {
+        unimplemented!()
+    }
+    fn from_u64(u: u64) -> Self {
+        unimplemented!()
+    }
+
     const ZERO: Self = Self([0; 4]);
     const ONE: Self = Self([1, 0, 0, 0]);
     const TWO: Self = Self([2, 0, 0, 0]);
diff --git a/field/src/types.rs b/field/src/types.rs
index 5a34bb6a3..f365ea299 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -94,6 +94,9 @@ pub trait Field:
     /// Whether this field is supported by cuda
     const CUDA_SUPPORT: bool = false;
 
+    fn to_u64(&self) -> u64;
+    fn from_u64(u: u64) -> Self;
+
     fn order() -> BigUint;
     fn characteristic() -> BigUint;
 
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index 9573048a6..ead3fc974 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -29,46 +29,64 @@ fn main() -> Result<()> {
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..2999999 {
+    for _ in 0..9999999 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
     }
     println!("Circuit built.");
 
-    // #[cfg(feature = "cuda")]
-    // {
-    //     zeknox::clear_cuda_errors_rs();
-    //     println!("Initializing CUDA twiddle factors...");
-    //     // Initialize twiddle factors for all dimensions that will be used
-    //     // This test involves multiple polynomials and recursive verification,
-    //     // so we initialize a wider range of dimensions to be safe
-    //     // for i in 0..=19 {
-    //     //     zeknox::init_twiddle_factors_rs(0, i);
-    //     // }
+    let size = 19;
 
-    //     zeknox::init_twiddle_factors_rs(0, 19);
-    //     zeknox::init_twiddle_factors_rs(0, 22);
-    //     // Initialize coset on GPU
-    //     // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
-    //     // TODO: Make this generic for other fields if needed
-    //     let coset_gen_u64 = 7u64;
-    //     zeknox::init_coset_rs(0, 22, coset_gen_u64);
-    //     // zeknox::init_coset_rs(0, 16, coset_gen_u64);
-    // }
+    #[cfg(feature = "cuda")]
+    {
+        use plonky2_field::fft;
+        use plonky2_field::goldilocks_field::GoldilocksField;
+        use plonky2_field::polynomial::PolynomialCoeffs;
+
+        zeknox::clear_cuda_errors_rs();
+        println!("Initializing CUDA twiddle factors...");
+        // Initialize twiddle factors for all dimensions that will be used
+        // This test involves multiple polynomials and recursive verification,
+        // so we initialize a wider range of dimensions to be safe
+        // for i in 0..=19 {
+        //     zeknox::init_twiddle_factors_rs(0, i);
+        // }
+
+        zeknox::init_twiddle_factors_rs(0, size);
+        zeknox::init_twiddle_factors_rs(0, size+3);
+        // Initialize coset on GPU
+        // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
+        // TODO: Make this generic for other fields if needed
+        let coset_gen_u64 = 7u64;
+        zeknox::init_coset_rs(0, size+3, coset_gen_u64);
+
+        // warm up GPU
+        // for some reason the first 10 FFTs are somewhat buggy
+
+        for i in 0..10 {
+            let t = (0..1 << size)
+                .map(|x| GoldilocksField::from_u64(i * x as u64))
+                .collect();
+            let poly = PolynomialCoeffs::new(t);
+            let _ = plonky2_field::fft::fft(poly.clone());
+        }
+        println!("CUDA twiddle factors initialized.");
+        // zeknox::init_coset_rs(0, 16, coset_gen_u64);
+    }
 
     // Public inputs are the two initial values (provided below) and the result (which is generated).
     builder.register_public_input(initial_a);
     builder.register_public_input(initial_b);
     builder.register_public_input(cur_target);
-
+    println!("Public inputs registered.");
     // Provide initial values.
     let mut pw = PartialWitness::new();
     pw.set_target(initial_a, F::ZERO)?;
     pw.set_target(initial_b, F::ONE)?;
-
+    println!("Initial values set in witness.");
     let data = builder.build::<C>();
-
+    println!("Circuit data built. Generating proof...");
     #[cfg(feature = "timing")]
     {
         use log::Level;
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index 174068c73..ed5af3b67 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -62,11 +62,12 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
+        println!("using slow ifft_cpu");
         let coeffs = timed!(
             timing,
             "IFFT",
             // Use sequential iteration for deterministic results
-            values.into_iter().map(|v| v.ifft()).collect::<Vec<_>>()
+            values.into_iter().map(|v| v.ifft_cpu()).collect::<Vec<_>>()
         );
 
         Self::from_coeffs(
diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs
index 421d15cdc..6ca5b712d 100644
--- a/plonky2/src/plonk/circuit_builder.rs
+++ b/plonky2/src/plonk/circuit_builder.rs
@@ -1225,6 +1225,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor)));
         let fft_root_table = fft_root_table(max_fft_points);
 
+        println!("start to commit");
         let constants_sigmas_commitment = if commit_to_sigma {
             let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat();
             PolynomialBatch::<F, C, D>::from_values(
@@ -1238,6 +1239,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         } else {
             PolynomialBatch::<F, C, D>::default()
         };
+        println!("end commit");
 
         // Map between gates where not all generators are used and the gate's number of used generators.
         let incomplete_gates = self

From 6ba6963363168363b57373d18a92a42bc2b5a5ed Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 19:28:13 +0000
Subject: [PATCH 17/37] wip clean up

---
 .gitignore                       |  3 ++
 field/Cargo.toml                 |  8 ++--
 field/src/extension/quadratic.rs |  8 ----
 field/src/extension/quartic.rs   |  8 ----
 field/src/extension/quintic.rs   |  8 ----
 field/src/fft.rs                 | 70 ++++++++------------------------
 field/src/goldilocks_field.rs    |  7 ----
 field/src/polynomial/mod.rs      |  2 +-
 field/src/secp256k1_base.rs      |  7 ----
 field/src/secp256k1_scalar.rs    |  6 ---
 field/src/types.rs               |  3 --
 plonky2/Cargo.toml               |  7 +++-
 plonky2/examples/fibonacci.rs    | 10 ++---
 13 files changed, 35 insertions(+), 112 deletions(-)

diff --git a/.gitignore b/.gitignore
index 293a17bb6..695fe9b7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ pgo-data.profdata
 
 # MacOS nuisances
 .DS_Store
+
+*.log
+
diff --git a/field/Cargo.toml b/field/Cargo.toml
index 1b6a62d71..ba60626ff 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -36,6 +36,8 @@ workspace = true
 
 
 [features]
-default = []
-# default = [ "cuda" ]
-cuda = []
\ No newline at end of file
+# default = []
+default = [ "cuda", "cuda_sanity_check" ]
+cuda = []
+# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
+cuda_sanity_check = ["cuda"]
\ No newline at end of file
diff --git a/field/src/extension/quadratic.rs b/field/src/extension/quadratic.rs
index ca74747b9..281369d21 100644
--- a/field/src/extension/quadratic.rs
+++ b/field/src/extension/quadratic.rs
@@ -59,14 +59,6 @@ impl<F: Extendable<2>> Sample for QuadraticExtension<F> {
 }
 
 impl<F: Extendable<2>> Field for QuadraticExtension<F> {
-    fn to_u64(&self) -> u64 {
-        unimplemented!()
-    }
-
-    fn from_u64(u: u64) -> Self {
-        unimplemented!()
-    }
-
     const ZERO: Self = Self([F::ZERO; 2]);
     const ONE: Self = Self([F::ONE, F::ZERO]);
     const TWO: Self = Self([F::TWO, F::ZERO]);
diff --git a/field/src/extension/quartic.rs b/field/src/extension/quartic.rs
index daa9d3aaf..8c8a9e7e4 100644
--- a/field/src/extension/quartic.rs
+++ b/field/src/extension/quartic.rs
@@ -65,14 +65,6 @@ impl<F: Extendable<4>> Sample for QuarticExtension<F> {
 }
 
 impl<F: Extendable<4>> Field for QuarticExtension<F> {
-    fn to_u64(&self) -> u64 {
-        unimplemented!()
-    }
-
-    fn from_u64(u: u64) -> Self {
-        unimplemented!()
-    }
-
     const ZERO: Self = Self([F::ZERO; 4]);
     const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO]);
     const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO]);
diff --git a/field/src/extension/quintic.rs b/field/src/extension/quintic.rs
index 21817c6c8..28ec92267 100644
--- a/field/src/extension/quintic.rs
+++ b/field/src/extension/quintic.rs
@@ -66,14 +66,6 @@ impl<F: Extendable<5>> Sample for QuinticExtension<F> {
 }
 
 impl<F: Extendable<5>> Field for QuinticExtension<F> {
-    fn to_u64(&self) -> u64 {
-        unimplemented!()
-    }
-
-    fn from_u64(u: u64) -> Self {
-        unimplemented!()
-    }
-
     const ZERO: Self = Self([F::ZERO; 5]);
     const ONE: Self = Self([F::ONE, F::ZERO, F::ZERO, F::ZERO, F::ZERO]);
     const TWO: Self = Self([F::TWO, F::ZERO, F::ZERO, F::ZERO, F::ZERO]);
diff --git a/field/src/fft.rs b/field/src/fft.rs
index 666c4d697..3ee9fb067 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -40,61 +40,32 @@ fn fft_dispatch_gpu<F: Field>(
     root_table: Option<&FftRootTable<F>>,
 ) {
     if F::CUDA_SUPPORT {
-        use zeknox::device::memory::HostOrDeviceSlice;
         use zeknox::ntt_batch;
         use zeknox::types::NTTConfig;
 
-        // let mut input_clone = input.to_vec();
-        // fft_dispatch_cpu(&mut input_clone, zero_factor, root_table);
-        // ark_std::println!("cpu done" );
+        #[cfg(feature = "cuda_sanity_check")]
+        let cpu_res = {
+            let mut input_clone = input.to_vec();
+            fft_dispatch_cpu(&mut input_clone, zero_factor, root_table);
+            input_clone
+        };
 
-        let total_elements = input.len();
-        let mut io_u64 = input.iter().map(|x| x.to_u64()).collect::<Vec<u64>>();
-
-        let mut device_data: HostOrDeviceSlice<'_, u64> =
-            HostOrDeviceSlice::cuda_malloc(0, total_elements).unwrap();
-        device_data
-            .copy_from_host_offset(&io_u64, 0, total_elements)
-            .unwrap();
         ntt_batch(
             0,
-            device_data.as_mut_ptr() as *mut F,
+            input,
             input.len().trailing_zeros() as usize,
             NTTConfig::default(),
         );
 
-        // Copy results back from device to host
-        io_u64.resize(total_elements, 0u64);
-        device_data
-            .copy_to_host(&mut io_u64, total_elements)
-            .unwrap();
-
-        // Convert u64 results back to field elements
-        input.iter_mut().zip(io_u64.iter()).for_each(|(a, b)| {
-            *a = F::from_canonical_u64(*b);
-        });
-        // ark_std::println!("gpu done" );
-
-        // let mut to_print = false;
-        // for (i, (a, b)) in input.iter().zip(input_clone.iter()).enumerate() {
-        //     if a != b {
-        //         // panic!("Mismatch at index {}: gpu result = {}, cpu result = {}", i, a.to_u64(), b.to_u64());
-        //         to_print = true;
-        //         ark_std::println!(
-        //             "Mismatch at index {}: gpu result = {}, cpu result = {}",
-        //             i,
-        //             a.to_u64(),
-        //             b.to_u64()
-        //         );
-        //     }
-        // }
-
-        // if to_print {
-        //     ark_std::println!("Comparing results...");
-        //     ark_std::println!("cpu {:?}", input_clone);
-        //     ark_std::println!("gpu {:?}", input);
-        // }
-
+        #[cfg(feature = "cuda_sanity_check")]
+        for (i, (a, b)) in input.iter().zip(cpu_res.iter()).enumerate() {
+            if a != b {
+                panic!(
+                    "Mismatch at index {}: gpu result = {}, cpu result = {}",
+                    i, a, b
+                );
+            }
+        }
         return;
     } else {
         return fft_dispatch_cpu(input, zero_factor, root_table);
@@ -109,16 +80,7 @@ fn fft_dispatch_cpu<F: Field>(
     if root_table.is_some() {
         return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap());
     } else {
-        // let pre_computed = F::pre_compute_fft_root_table(input.len());
-        // if pre_computed.is_some() {
-        //     return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap());
-        // } else {
-        //     let computed = fft_root_table::<F>(input.len());
-
-        //     return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
-        // }
         let computed = fft_root_table::<F>(input.len());
-
         return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
     };
 }
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index 2ac66d4ea..ae8457744 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -68,13 +68,6 @@ impl Sample for GoldilocksField {
 }
 
 impl Field for GoldilocksField {
-    fn to_u64(&self) -> u64 {
-        self.0
-    }
-    fn from_u64(u: u64) -> Self {
-        Self(u)
-    }
-
     const ZERO: Self = Self(0);
     const ONE: Self = Self(1);
     const TWO: Self = Self(2);
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index 21eae220c..5ce66943c 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -12,7 +12,7 @@ use plonky2_util::log2_strict;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension};
-use crate::fft::{FftRootTable, fft, fft_with_options, ifft, ifft_cpu};
+use crate::fft::{fft, fft_with_options, ifft, ifft_cpu, FftRootTable};
 use crate::types::Field;
 
 /// A polynomial in point-value form.
diff --git a/field/src/secp256k1_base.rs b/field/src/secp256k1_base.rs
index 20bc7a395..6632a7f83 100644
--- a/field/src/secp256k1_base.rs
+++ b/field/src/secp256k1_base.rs
@@ -98,13 +98,6 @@ impl Field for Secp256K1Base {
 
     const BITS: usize = 256;
 
-    fn to_u64(&self) -> u64 {
-        unimplemented!()
-    }
-    fn from_u64(u: u64) -> Self {
-        unimplemented!()
-    }
-
     fn order() -> BigUint {
         BigUint::from_slice(&[
             0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs
index 3514188e9..2b299130d 100644
--- a/field/src/secp256k1_scalar.rs
+++ b/field/src/secp256k1_scalar.rs
@@ -79,12 +79,6 @@ impl Sample for Secp256K1Scalar {
 }
 
 impl Field for Secp256K1Scalar {
-    fn to_u64(&self) -> u64 {
-        unimplemented!()
-    }
-    fn from_u64(u: u64) -> Self {
-        unimplemented!()
-    }
 
     const ZERO: Self = Self([0; 4]);
     const ONE: Self = Self([1, 0, 0, 0]);
diff --git a/field/src/types.rs b/field/src/types.rs
index f365ea299..5a34bb6a3 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -94,9 +94,6 @@ pub trait Field:
     /// Whether this field is supported by cuda
     const CUDA_SUPPORT: bool = false;
 
-    fn to_u64(&self) -> u64;
-    fn from_u64(u: u64) -> Self;
-
     fn order() -> BigUint;
     fn characteristic() -> BigUint;
 
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 2de7408ef..5121535a7 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -12,13 +12,16 @@ keywords.workspace = true
 categories.workspace = true
 
 [features]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
-# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
 std = ["anyhow/std", "rand/std", "itertools/use_std"]
 timing = ["std", "dep:web-time"]
 cuda = ["plonky2_field/cuda"]
+# sanity check: when this flag is on, the computation will be done on both CPU and CUDA, and the results are compared
+cuda_sanity_check = [ "cuda", "plonky2_field/cuda_sanity_check" ]
+
 
 [dependencies]
 ahash = { workspace = true }
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index ead3fc974..cdd4faac8 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -29,14 +29,14 @@ fn main() -> Result<()> {
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..9999999 {
+    for _ in 0..999999 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
     }
     println!("Circuit built.");
 
-    let size = 19;
+    let size = 16;
 
     #[cfg(feature = "cuda")]
     {
@@ -54,19 +54,19 @@ fn main() -> Result<()> {
         // }
 
         zeknox::init_twiddle_factors_rs(0, size);
-        zeknox::init_twiddle_factors_rs(0, size+3);
+        zeknox::init_twiddle_factors_rs(0, size + 3);
         // Initialize coset on GPU
         // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
         // TODO: Make this generic for other fields if needed
         let coset_gen_u64 = 7u64;
-        zeknox::init_coset_rs(0, size+3, coset_gen_u64);
+        zeknox::init_coset_rs(0, size + 3, coset_gen_u64);
 
         // warm up GPU
         // for some reason the first 10 FFTs are somewhat buggy
 
         for i in 0..10 {
             let t = (0..1 << size)
-                .map(|x| GoldilocksField::from_u64(i * x as u64))
+                .map(|x| GoldilocksField::from_canonical_u64(i * x as u64))
                 .collect();
             let poly = PolynomialCoeffs::new(t);
             let _ = plonky2_field::fft::fft(poly.clone());

From 7b659d75562c7f2a07c258164581a147e262ac77 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 19:43:03 +0000
Subject: [PATCH 18/37] continue clean up

---
 field/src/fft.rs                     | 10 --------
 field/src/secp256k1_scalar.rs        |  1 -
 plonky2/src/fri/oracle.rs            | 34 +++++++++++++++++++++++++---
 plonky2/src/plonk/circuit_builder.rs |  6 ++---
 4 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/field/src/fft.rs b/field/src/fft.rs
index 3ee9fb067..6c6874a62 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -32,7 +32,6 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
-#[allow(dead_code)]
 #[cfg(feature = "cuda")]
 fn fft_dispatch_gpu<F: Field>(
     input: &mut [F],
@@ -297,14 +296,6 @@ mod tests {
         type F = GoldilocksField;
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
-        println!("Initializing CUDA");
-
-        #[cfg(feature = "cuda")]
-        for i in 8..=12 {
-            zeknox::init_twiddle_factors_rs(0, i);
-        }
-
-        println!("Testing fft/ifft with degree {}", degree);
 
         // Create a vector of coeffs; the first degree of them are
         // "random", the last degree_padded-degree of them are zero.
@@ -326,7 +317,6 @@ mod tests {
             assert_eq!(interpolated_coefficients.coeffs[i], F::ZERO);
         }
 
-        println!("Testing ldes");
         for r in 0..4 {
             // expand coefficients by factor 2^r by filling with zeros
             let zero_tail = coefficients.lde(r);
diff --git a/field/src/secp256k1_scalar.rs b/field/src/secp256k1_scalar.rs
index 2b299130d..3ca5b6ba2 100644
--- a/field/src/secp256k1_scalar.rs
+++ b/field/src/secp256k1_scalar.rs
@@ -79,7 +79,6 @@ impl Sample for Secp256K1Scalar {
 }
 
 impl Field for Secp256K1Scalar {
-
     const ZERO: Self = Self([0; 4]);
     const ONE: Self = Self([1, 0, 0, 0]);
     const TWO: Self = Self([2, 0, 0, 0]);
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index ed5af3b67..11054cb51 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -53,6 +53,36 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> D
 impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     PolynomialBatch<F, C, D>
 {
+    /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
+    /// This function is called by the builder during preprocessing the circuit.
+    /// We use a parallel IFFT on the CPU here to avoid a strange GPU issue.
+    pub fn preprocessor_from_values(
+        values: Vec<PolynomialValues<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        fft_root_table: Option<&FftRootTable<F>>,
+    ) -> Self {
+        let coeffs = timed!(
+            timing,
+            "IFFT",
+            values
+                .into_par_iter()
+                .map(|v| v.ifft_cpu())
+                .collect::<Vec<_>>()
+        );
+
+        Self::from_coeffs(
+            coeffs,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            fft_root_table,
+        )
+    }
+
     /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
     pub fn from_values(
         values: Vec<PolynomialValues<F>>,
@@ -62,12 +92,10 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
-        println!("using slow ifft_cpu");
         let coeffs = timed!(
             timing,
             "IFFT",
-            // Use sequential iteration for deterministic results
-            values.into_iter().map(|v| v.ifft_cpu()).collect::<Vec<_>>()
+            values.into_par_iter().map(|v| v.ifft()).collect::<Vec<_>>()
         );
 
         Self::from_coeffs(
diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs
index 6ca5b712d..5b2f2c3a1 100644
--- a/plonky2/src/plonk/circuit_builder.rs
+++ b/plonky2/src/plonk/circuit_builder.rs
@@ -1225,10 +1225,11 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor)));
         let fft_root_table = fft_root_table(max_fft_points);
 
-        println!("start to commit");
+        // This part of the code on GPU is buggy. So we use CPU for computation.
+        // It does not impact performance as this is only done once during setup.
         let constants_sigmas_commitment = if commit_to_sigma {
             let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat();
-            PolynomialBatch::<F, C, D>::from_values(
+            PolynomialBatch::<F, C, D>::preprocessor_from_values(
                 constants_sigmas_vecs,
                 rate_bits,
                 PlonkOracle::CONSTANTS_SIGMAS.blinding,
@@ -1239,7 +1240,6 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         } else {
             PolynomialBatch::<F, C, D>::default()
         };
-        println!("end commit");
 
         // Map between gates where not all generators are used and the gate's number of used generators.
         let incomplete_gates = self

From 83f3d96eb68b79b7c758a762c120a60ba663d981 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 20:15:13 +0000
Subject: [PATCH 19/37] merkle tree sanity checks

---
 field/src/polynomial/mod.rs     |  3 +++
 perm_comp.md                    | 36 -------------------------
 plonky2/src/fri/oracle.rs       | 13 ---------
 plonky2/src/hash/merkle_tree.rs | 48 ++++++++++++++++++++++++---------
 4 files changed, 39 insertions(+), 61 deletions(-)
 delete mode 100644 perm_comp.md

diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index 5ce66943c..a38a7c553 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -55,10 +55,13 @@ impl<F: Field> PolynomialValues<F> {
         self.values.len()
     }
 
+    /// Adaptive IFFT: uses GPU if available, otherwise CPU.
     pub fn ifft(self) -> PolynomialCoeffs<F> {
         ifft(self)
     }
 
+    /// Forces the IFFT to run on the CPU.
+    /// Used to bypass the GPU issue during the setup phase.
     pub fn ifft_cpu(self) -> PolynomialCoeffs<F> {
         ifft_cpu(self)
     }
diff --git a/perm_comp.md b/perm_comp.md
deleted file mode 100644
index 333134394..000000000
--- a/perm_comp.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Performance comparison
-- CPU: AMD 7950x3d; 16 core
-- GPU: NVidia 4080; single card
-- Circuit size: 2^19 gates
-- Total CPU time: **32.97 s**
-- Total GPU time: **19.71 s**
-
-| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? |
-|-----------|---------|---------|---------|------------|
-| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated |
-| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated |
-| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated |
-| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes |
-| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** |
-| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated |
-| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned |
-| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated |
-| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes |
-| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** |
-| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated |
-| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned |
-| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated |
-| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated|
-| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes |
-| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated |
-| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned |
-| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated |
-| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated |
-| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated |
-| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated |
-| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated |
-| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated |
-| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated |
\ No newline at end of file
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index 11054cb51..d008301cf 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -125,20 +125,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         );
 
         let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values));
-        // Debug: Print first leaf for determinism check
-        if !leaves.is_empty() && !leaves[0].is_empty() {
-            println!(
-                "First leaf before reverse_bits: {:?}",
-                &leaves[0][..4.min(leaves[0].len())]
-            );
-        }
         reverse_index_bits_in_place(&mut leaves);
-        if !leaves.is_empty() && !leaves[0].is_empty() {
-            println!(
-                "First leaf after reverse_bits: {:?}",
-                &leaves[0][..4.min(leaves[0].len())]
-            );
-        }
         let merkle_tree = timed!(
             timing,
             "build Merkle tree",
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 3561471bd..102a8174c 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -484,9 +484,43 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
-        let now = Instant::now();
+
+        #[cfg(feature = "cuda_sanity_check")]
+        let (digests_buf_cpu, cap_cpu) = {
+            let mut digests_buf_cpu = digests_buf.to_vec();
+            let mut cap_buf_cpu = cap_buf.to_vec();
+
+            fill_digests_buf::<F, H>(
+                &mut digests_buf_cpu,
+                &mut cap_buf_cpu,
+                &leaves_1d.clone(),
+                leaf_size,
+                cap_height,
+            );
+
+            (digests_buf_cpu, cap_buf_cpu)
+        };
+
         fill_digests_buf_meta::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
-        print_time(now, "fill digests buffer");
+
+        #[cfg(feature = "cuda_sanity_check")]
+        {
+            for i in 0..num_digests {
+                unsafe {
+                    let hash1 = digests_buf[i].assume_init();
+                    let hash2 = digests_buf_cpu[i].assume_init();
+                    assert_eq!(hash1, hash2, "Digest mismatch at index {}", i);
+                }
+            }
+            for i in 0..len_cap {
+                unsafe {
+                    let hash1 = cap_buf[i].assume_init();
+                    let hash2 = cap_cpu[i].assume_init();
+                    assert_eq!(hash1, hash2, "Cap mismatch at index {}", i);
+                }
+            }
+        }
+
 
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
@@ -494,16 +528,6 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             digests.set_len(num_digests);
             cap.set_len(len_cap);
         }
-        /*
-        println!{"Digest Buffer"};
-        for dg in &digests {
-            println!("{:?}", dg);
-        }
-        println!{"Cap Buffer"};
-        for dg in &cap {
-            println!("{:?}", dg);
-        }
-        */
         Self {
             leaves: leaves_1d,
             leaf_size,

From 2c9dd3fbd373518994450a8b148f3ad135e14075 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 20:54:12 +0000
Subject: [PATCH 20/37] fix more tests

---
 field/src/fft.rs                | 11 +++++
 field/src/polynomial/mod.rs     | 21 +++++++++
 fix_env.md                      | 79 +++++++++++++++++++++++++++++++++
 plonky2/src/hash/merkle_tree.rs |  1 -
 4 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 fix_env.md

diff --git a/field/src/fft.rs b/field/src/fft.rs
index 6c6874a62..6b0ea188c 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -293,6 +293,17 @@ mod tests {
 
     #[test]
     fn fft_and_ifft() {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for sizes we'll use
+            // degree_padded is 256 = 2^8
+            // lde then add 4 more bits
+            for i in 8..=12 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         type F = GoldilocksField;
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index a38a7c553..04ee20cca 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -454,6 +454,17 @@ mod tests {
     use crate::goldilocks_field::GoldilocksField;
     use crate::types::Sample;
 
+    #[cfg(feature = "cuda")]
+    fn init_gpu_for_tests() {
+        zeknox::clear_cuda_errors_rs();
+        // Initialize twiddle factors for various sizes
+        for i in 0..=20 {
+            zeknox::init_twiddle_factors_rs(0, i);
+        }
+        let coset_gen_u64 = 7u64;
+        zeknox::init_coset_rs(0, 20, coset_gen_u64);
+    }
+
     #[test]
     fn test_trimmed() {
         type F = GoldilocksField;
@@ -482,6 +493,9 @@ mod tests {
 
     #[test]
     fn test_coset_fft() {
+        #[cfg(feature = "cuda")]
+        init_gpu_for_tests();
+
         type F = GoldilocksField;
 
         let k = 8;
@@ -503,6 +517,9 @@ mod tests {
 
     #[test]
     fn test_coset_ifft() {
+        #[cfg(feature = "cuda")]
+        init_gpu_for_tests();
+
         type F = GoldilocksField;
 
         let k = 8;
@@ -609,6 +626,10 @@ mod tests {
     // `(X^n - 1)/(X - a)
     #[test]
     fn test_division_linear() {
+
+        #[cfg(feature = "cuda")]
+        init_gpu_for_tests();
+
         type F = GoldilocksField;
         let mut rng = OsRng;
         let l = 14;
diff --git a/fix_env.md b/fix_env.md
new file mode 100644
index 000000000..d8304c817
--- /dev/null
+++ b/fix_env.md
@@ -0,0 +1,79 @@
+# NVIDIA Driver Fix After Reboot
+
+## Problem
+After system reboot, `nvidia-smi` fails with error:
+```
+NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver.
+```
+
+## Root Cause
+System updates install new kernel versions, but NVIDIA driver modules aren't automatically built for the new kernel because kernel headers are missing.
+
+## Quick Fix (Run after each kernel update)
+
+```bash
+# 1. Install kernel headers for current kernel
+sudo apt update
+sudo apt install -y linux-headers-$(uname -r)
+
+# 2. DKMS will automatically rebuild NVIDIA modules
+# Wait for the installation to complete (shows "Building module(s)..." and "done")
+
+# 3. Load the NVIDIA driver
+sudo modprobe nvidia
+
+# 4. Verify it works
+nvidia-smi
+```
+
+## Diagnostic Commands
+
+Check if NVIDIA modules are built for current kernel:
+```bash
+uname -r                    # Show current kernel version
+dkms status                 # Check which kernels have NVIDIA modules
+modinfo nvidia              # Check if nvidia module exists for current kernel
+lsmod | grep nvidia         # Check if nvidia modules are loaded
+```
+
+## Prevention - Auto-install Headers (Recommended)
+
+Set up automatic kernel header installation:
+```bash
+sudo bash -c 'cat > /etc/apt/apt.conf.d/99auto-kernel-headers <<EOF
+# Automatically install kernel headers when kernel is upgraded
+DPkg::Post-Invoke {
+  "if [ -x /usr/bin/dkms ]; then /usr/bin/apt-get install -y linux-headers-\$(uname -r) || true; fi";
+};
+EOF'
+```
+
+After this one-time setup, kernel headers will be installed automatically with kernel updates.
+
+## Alternative - Pin Kernel Version (Not Recommended)
+
+If you want to prevent kernel updates entirely:
+```bash
+# Hold current kernel version
+sudo apt-mark hold linux-image-aws linux-headers-aws
+
+# To allow updates again later
+sudo apt-mark unhold linux-image-aws linux-headers-aws
+```
+
+## Understanding the Issue
+
+- When you boot, Linux loads the kernel from `/boot`
+- NVIDIA driver kernel modules must match the running kernel version
+- These modules live in `/lib/modules/$(uname -r)/`
+- DKMS (Dynamic Kernel Module Support) builds these modules
+- DKMS requires kernel headers to build modules
+- If headers are missing for new kernel → no NVIDIA modules → nvidia-smi fails
+
+## System Info
+
+Current setup:
+- NVIDIA Driver: 580.105.08
+- GPU: NVIDIA L4
+- OS: Ubuntu 24.04 (noble)
+- Kernel series: 6.14.0-aws (backported from newer Ubuntu)
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 102a8174c..d98f41681 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -521,7 +521,6 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             }
         }
 
-
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.

From fee54ed00715a5859b5046f484621d660ded43af Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 21:00:25 +0000
Subject: [PATCH 21/37] clean up

---
 field/src/fft.rs            | 8 ++++----
 field/src/polynomial/mod.rs | 6 ++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/field/src/fft.rs b/field/src/fft.rs
index 6b0ea188c..2509c7b2c 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -77,10 +77,10 @@ fn fft_dispatch_cpu<F: Field>(
     root_table: Option<&FftRootTable<F>>,
 ) {
     if root_table.is_some() {
-        return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap());
+        fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap())
     } else {
         let computed = fft_root_table::<F>(input.len());
-        return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
+        fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref())
     };
 }
 
@@ -92,11 +92,11 @@ fn fft_dispatch<F: Field>(
 ) {
     #[cfg(feature = "cuda")]
     {
-        return fft_dispatch_gpu(input, zero_factor, root_table);
+        fft_dispatch_gpu(input, zero_factor, root_table)
     }
     #[cfg(not(feature = "cuda"))]
     {
-        return fft_dispatch_cpu(input, zero_factor, root_table);
+        fft_dispatch_cpu(input, zero_factor, root_table)
     }
 }
 
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index 04ee20cca..ffe5d18c6 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -445,7 +445,6 @@ impl<F: Field> Mul for &PolynomialCoeffs<F> {
 
 #[cfg(test)]
 mod tests {
-    use std::time::Instant;
 
     use rand::rngs::OsRng;
     use rand::Rng;
@@ -625,10 +624,9 @@ mod tests {
     // Test to see which polynomial division method is faster for divisions of the type
     // `(X^n - 1)/(X - a)
     #[test]
+    #[cfg(not(feature = "cuda"))]
     fn test_division_linear() {
-
-        #[cfg(feature = "cuda")]
-        init_gpu_for_tests();
+        use std::time::Instant;
 
         type F = GoldilocksField;
         let mut rng = OsRng;

From 4b82d19ac25bdaab8123532699ed8b5145806667 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 21:10:02 +0000
Subject: [PATCH 22/37] clean up printing

---
 field/Cargo.toml                      |  1 -
 perm_comp.md                          | 36 +++++++++++++++++++++++++++
 plonky2/examples/fibonacci.rs         | 26 -------------------
 plonky2/src/gates/equality_base.rs    |  1 -
 plonky2/src/hash/merkle_tree.rs       |  5 ++++
 plonky2/src/iop/generator.rs          |  5 ----
 plonky2/src/plonk/prover.rs           | 19 +++-----------
 plonky2/src/util/serialization/mod.rs |  2 --
 8 files changed, 44 insertions(+), 51 deletions(-)
 create mode 100644 perm_comp.md

diff --git a/field/Cargo.toml b/field/Cargo.toml
index ba60626ff..80b4478da 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -11,7 +11,6 @@ keywords.workspace = true
 categories.workspace = true
 
 [dependencies]
-ark-std = "0.5.0"
 anyhow = { workspace = true }
 itertools = { workspace = true, features = ["use_alloc"] }
 num = { workspace = true, features = ["alloc"] }
diff --git a/perm_comp.md b/perm_comp.md
new file mode 100644
index 000000000..333134394
--- /dev/null
+++ b/perm_comp.md
@@ -0,0 +1,36 @@
+# Performance comparison
+- CPU: AMD 7950x3d; 16 core
+- GPU: NVidia 4080; single card
+- Circuit size: 2^19 gates
+- Total CPU time: **32.97 s**
+- Total GPU time: **19.71 s**
+
+| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? |
+|-----------|---------|---------|---------|------------|
+| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated |
+| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated |
+| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated |
+| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes |
+| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** |
+| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** |
+| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated |
+| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned |
+| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated |
+| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes |
+| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** |
+| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** |
+| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated |
+| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned |
+| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated |
+| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated|
+| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes |
+| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** |
+| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated |
+| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned |
+| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated |
+| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated |
+| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated |
+| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated |
+| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated |
+| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated |
+| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated |
\ No newline at end of file
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index cdd4faac8..fade89256 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -1,11 +1,9 @@
 use anyhow::{Ok, Result};
-use log::Level;
 use plonky2::field::types::Field;
 use plonky2::iop::witness::{PartialWitness, WitnessWrite};
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 use plonky2::plonk::circuit_data::CircuitConfig;
 use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
-use plonky2::util::timing::TimingTree;
 
 /// An example of using Plonky2 to prove a statement of the form
 /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b."
@@ -40,39 +38,15 @@ fn main() -> Result<()> {
 
     #[cfg(feature = "cuda")]
     {
-        use plonky2_field::fft;
-        use plonky2_field::goldilocks_field::GoldilocksField;
-        use plonky2_field::polynomial::PolynomialCoeffs;
-
         zeknox::clear_cuda_errors_rs();
         println!("Initializing CUDA twiddle factors...");
-        // Initialize twiddle factors for all dimensions that will be used
-        // This test involves multiple polynomials and recursive verification,
-        // so we initialize a wider range of dimensions to be safe
-        // for i in 0..=19 {
-        //     zeknox::init_twiddle_factors_rs(0, i);
-        // }
 
         zeknox::init_twiddle_factors_rs(0, size);
         zeknox::init_twiddle_factors_rs(0, size + 3);
         // Initialize coset on GPU
         // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
-        // TODO: Make this generic for other fields if needed
         let coset_gen_u64 = 7u64;
         zeknox::init_coset_rs(0, size + 3, coset_gen_u64);
-
-        // warm up GPU
-        // for some reason the first 10 FFTs are somewhat buggy
-
-        for i in 0..10 {
-            let t = (0..1 << size)
-                .map(|x| GoldilocksField::from_canonical_u64(i * x as u64))
-                .collect();
-            let poly = PolynomialCoeffs::new(t);
-            let _ = plonky2_field::fft::fft(poly.clone());
-        }
-        println!("CUDA twiddle factors initialized.");
-        // zeknox::init_coset_rs(0, 16, coset_gen_u64);
     }
 
     // Public inputs are the two initial values (provided below) and the result (which is generated).
diff --git a/plonky2/src/gates/equality_base.rs b/plonky2/src/gates/equality_base.rs
index 50a315e81..86c302e18 100644
--- a/plonky2/src/gates/equality_base.rs
+++ b/plonky2/src/gates/equality_base.rs
@@ -160,7 +160,6 @@ impl<F: RichField + Extendable<D>, const D: usize> Gate<F, D> for EqualityGate {
                 )
             })
             .collect();
-        //println!("generators {:?}", result.len());
         result
     }
 
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index d98f41681..30693ca6a 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -35,10 +35,14 @@ use crate::util::log2_strict;
 #[cfg(feature = "cuda")]
 pub static GPU_ID: Lazy<Arc<Mutex<u64>>> = Lazy::new(|| Arc::new(Mutex::new(0)));
 
+#[cfg(feature = "cuda")]
 fn print_time(now: Instant, msg: &str) {
     println!("Time {} {} ms", msg, now.elapsed().as_millis());
 }
 
+#[cfg(not(feature = "cuda"))]
+fn print_time(_now: Instant, _msg: &str) {}
+
 #[cfg(feature = "cuda")]
 const FORCE_SINGLE_GPU: bool = true;
 
@@ -521,6 +525,7 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             }
         }
 
+
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.
diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs
index 8e387c4cd..d24d8d42b 100644
--- a/plonky2/src/iop/generator.rs
+++ b/plonky2/src/iop/generator.rs
@@ -36,7 +36,6 @@ pub fn generate_partial_witness<
     let config = &common_data.config;
     let generators = &prover_data.generators;
     let generator_indices_by_watches = &prover_data.generator_indices_by_watches;
-    println!("Initializing witness.");
     let mut witness = PartitionWitness::new(
         config.num_wires,
         common_data.degree(),
@@ -57,8 +56,6 @@ pub fn generate_partial_witness<
 
     let mut buffer = GeneratedValues::empty();
 
-    println!("Starting generator execution.");
-
     // Keep running generators until we fail to make progress.
     while !pending_generator_indices.is_empty() {
         let mut next_pending_generator_indices = Vec::new();
@@ -98,8 +95,6 @@ pub fn generate_partial_witness<
         pending_generator_indices = next_pending_generator_indices;
     }
 
-    println!("Finished generator execution.");
-
     if remaining_generators != 0 {
         return Err(anyhow!("{} generators weren't run", remaining_generators));
     }
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index 649c811a1..64771ae3a 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -150,7 +150,6 @@ where
     let degree = common_data.degree();
 
     set_lookup_wires(prover_data, common_data, &mut partition_witness)?;
-    println!("Set lookup wires.");
 
     let public_inputs = partition_witness.get_targets(&prover_data.public_inputs);
     let public_inputs_hash = C::InnerHasher::hash_no_pad(&public_inputs);
@@ -160,7 +159,6 @@ where
         "compute full witness",
         partition_witness.full_witness()
     );
-    println!("Computed full witness.");
     let wires_values: Vec<PolynomialValues<F>> = timed!(
         timing,
         "compute wire polynomials",
@@ -171,7 +169,6 @@ where
             .map(|column| PolynomialValues::new(column.clone()))
             .collect()
     );
-    println!("Computed wire polynomials.");
     // Debug: Print first few wire values to check determinism
     if !wires_values.is_empty() && !wires_values[0].values.is_empty() {
         println!(
@@ -191,7 +188,6 @@ where
             prover_data.fft_root_table.as_ref(),
         )
     );
-    println!("Computed wires commitment.");
     let mut challenger = Challenger::<F, C::Hasher>::new();
 
     // Observe the FRI config
@@ -239,7 +235,6 @@ where
         .collect();
     let zs_partial_products = [plonk_z_vecs, partial_products_and_zs.concat()].concat();
 
-    println!("Computed Z and partial products.");
     // All lookup polys: RE and partial SLDCs.
     let lookup_polys =
         compute_all_lookup_polys(&witness, &deltas, prover_data, common_data, has_lookup);
@@ -249,8 +244,7 @@ where
     } else {
         zs_partial_products
     };
-
-    println!("Computed lookup polynomials.");
+    
     let partial_products_zs_and_lookup_commitment = timed!(
         timing,
         "commit to partial products, Z's and, if any, lookup polynomials",
@@ -283,12 +277,7 @@ where
             &alphas,
         )
     );
-    println!("prover alphas: {:?}", alphas);
-    println!("prover betas: {:?}", betas);
-    println!("prover gammas: {:?}", gammas);
-    println!("prover deltas: {:?}", deltas);
 
-    println!("Split up quotient polys.");
     let all_quotient_poly_chunks: Vec<PolynomialCoeffs<F>> = timed!(
         timing,
         "split up quotient polys",
@@ -304,7 +293,6 @@ where
             .collect()
     );
 
-    println!("Committed to quotient polys.");
     let quotient_polys_commitment = timed!(
         timing,
         "commit to quotient polys",
@@ -317,12 +305,11 @@ where
             prover_data.fft_root_table.as_ref(),
         )
     );
-
-    println!("Committed to quotient polys.");
+    
     challenger.observe_cap::<C::Hasher>(&quotient_polys_commitment.merkle_tree.cap);
 
     let zeta = challenger.get_extension_challenge::<D>();
-    println!("prover zeta: {:?}", zeta);
+
     // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and
     // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since
     // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`.
diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs
index fbfd7974f..209b43d3c 100644
--- a/plonky2/src/util/serialization/mod.rs
+++ b/plonky2/src/util/serialization/mod.rs
@@ -324,7 +324,6 @@ pub trait Read {
         let leaf_len = self.read_usize()?;
         let mut leaves_2d = Vec::with_capacity(leaves_len * leaf_len);
         for _ in 0..leaves_len {
-            // let leaf_len = self.read_usize()?;
             leaves_2d.push(self.read_field_vec(leaf_len)?);
         }
 
@@ -1426,7 +1425,6 @@ pub trait Write {
         self.write_usize(leaves_count)?;
         self.write_usize(tree.leaf_size)?;
         for i in 0..leaves_count {
-            // self.write_usize(tree.leaf_size)?;
             self.write_field_vec(&tree.get(i))?;
         }
         self.write_hash_vec::<F, H>(&tree.digests)?;

From 8761671a74dc4825f1b32ef2e8c4bea0235ba463 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 21:25:40 +0000
Subject: [PATCH 23/37] clean up and scripts for testing

---
 field/Cargo.toml                  |   4 +-
 fix_env.md                        |  79 -----------------
 perm_comp.md                      |  36 --------
 plonky2/Cargo.toml                |   4 +-
 plonky2/examples/fibonacci.rs     |   4 +-
 plonky2/src/hash/merkle_proofs.rs |   1 +
 plonky2/src/hash/merkle_tree.rs   |   2 +-
 plonky2/src/plonk/prover.rs       |   4 +-
 plonky2/src/util/mod.rs           |   1 -
 test_gpu.sh                       | 141 ++++++++++++++++++++++++++++++
 10 files changed, 151 insertions(+), 125 deletions(-)
 delete mode 100644 fix_env.md
 delete mode 100644 perm_comp.md
 create mode 100755 test_gpu.sh

diff --git a/field/Cargo.toml b/field/Cargo.toml
index 80b4478da..02f535922 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -35,8 +35,8 @@ workspace = true
 
 
 [features]
-# default = []
-default = [ "cuda", "cuda_sanity_check" ]
+default = []
+# default = [ "cuda", "cuda_sanity_check" ]
 cuda = []
 # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
 cuda_sanity_check = ["cuda"]
\ No newline at end of file
diff --git a/fix_env.md b/fix_env.md
deleted file mode 100644
index d8304c817..000000000
--- a/fix_env.md
+++ /dev/null
@@ -1,79 +0,0 @@
-# NVIDIA Driver Fix After Reboot
-
-## Problem
-After system reboot, `nvidia-smi` fails with error:
-```
-NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver.
-```
-
-## Root Cause
-System updates install new kernel versions, but NVIDIA driver modules aren't automatically built for the new kernel because kernel headers are missing.
-
-## Quick Fix (Run after each kernel update)
-
-```bash
-# 1. Install kernel headers for current kernel
-sudo apt update
-sudo apt install -y linux-headers-$(uname -r)
-
-# 2. DKMS will automatically rebuild NVIDIA modules
-# Wait for the installation to complete (shows "Building module(s)..." and "done")
-
-# 3. Load the NVIDIA driver
-sudo modprobe nvidia
-
-# 4. Verify it works
-nvidia-smi
-```
-
-## Diagnostic Commands
-
-Check if NVIDIA modules are built for current kernel:
-```bash
-uname -r                    # Show current kernel version
-dkms status                 # Check which kernels have NVIDIA modules
-modinfo nvidia              # Check if nvidia module exists for current kernel
-lsmod | grep nvidia         # Check if nvidia modules are loaded
-```
-
-## Prevention - Auto-install Headers (Recommended)
-
-Set up automatic kernel header installation:
-```bash
-sudo bash -c 'cat > /etc/apt/apt.conf.d/99auto-kernel-headers <<EOF
-# Automatically install kernel headers when kernel is upgraded
-DPkg::Post-Invoke {
-  "if [ -x /usr/bin/dkms ]; then /usr/bin/apt-get install -y linux-headers-\$(uname -r) || true; fi";
-};
-EOF'
-```
-
-After this one-time setup, kernel headers will be installed automatically with kernel updates.
-
-## Alternative - Pin Kernel Version (Not Recommended)
-
-If you want to prevent kernel updates entirely:
-```bash
-# Hold current kernel version
-sudo apt-mark hold linux-image-aws linux-headers-aws
-
-# To allow updates again later
-sudo apt-mark unhold linux-image-aws linux-headers-aws
-```
-
-## Understanding the Issue
-
-- When you boot, Linux loads the kernel from `/boot`
-- NVIDIA driver kernel modules must match the running kernel version
-- These modules live in `/lib/modules/$(uname -r)/`
-- DKMS (Dynamic Kernel Module Support) builds these modules
-- DKMS requires kernel headers to build modules
-- If headers are missing for new kernel → no NVIDIA modules → nvidia-smi fails
-
-## System Info
-
-Current setup:
-- NVIDIA Driver: 580.105.08
-- GPU: NVIDIA L4
-- OS: Ubuntu 24.04 (noble)
-- Kernel series: 6.14.0-aws (backported from newer Ubuntu)
diff --git a/perm_comp.md b/perm_comp.md
deleted file mode 100644
index 333134394..000000000
--- a/perm_comp.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Performance comparison
-- CPU: AMD 7950x3d; 16 core
-- GPU: NVidia 4080; single card
-- Circuit size: 2^19 gates
-- Total CPU time: **32.97 s**
-- Total GPU time: **19.71 s**
-
-| Operation | CPU (s) | GPU (s) | Speedup | GPU Tuned? |
-|-----------|---------|---------|---------|------------|
-| **Run generators** | 1.7767 | 1.7899 | - | ✗ Not accelerated |
-| **Compute full witness** | 0.3369 | 0.3362 | - | ✗ Not accelerated |
-| **Compute wire polynomials** | 0.0396 | 0.0392 | - | ✗ Not accelerated |
-| **Compute wires commitment** | 20.1902 | 10.0548 | **2.01x** | ✓ Yes |
-| └─ IFFT | 1.2070 | 0.1587 | **7.61x** | ✓ **Highly tuned** |
-| └─ FFT + blinding | 11.4267 | 3.6139 | **3.16x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 2.8010 | 2.7881 | - | ✗ Not accelerated |
-| └─ Build Merkle tree | 4.5166 | 3.2734 | **1.38x** | ✓ Tuned |
-| **Compute partial products** | 0.1700 | 0.1671 | - | ✗ Not accelerated |
-| **Commit to partial products/Z's** | 3.4213 | 1.6982 | **2.01x** | ✓ Yes |
-| └─ IFFT | 0.1860 | 0.0241 | **7.72x** | ✓ **Highly tuned** |
-| └─ FFT + blinding | 1.7627 | 0.4778 | **3.69x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 0.3906 | 0.3874 | - | ✗ Not accelerated |
-| └─ Build Merkle tree | 1.0253 | 0.7573 | **1.35x** | ✓ Tuned |
-| **Compute quotient polys** | 1.4041 | 1.3128 | - | ✗ Not accelerated |
-| **Split quotient polys** | 0.0098 | 0.0212 | - | ✗ Not accelerated|
-| **Commit to quotient polys** | 2.6641 | 1.4077 | **1.89x** | ✓ Yes |
-| └─ FFT + blinding | 1.5496 | 0.4315 | **3.59x** | ✓ **Highly tuned** |
-| └─ Transpose LDEs | 0.2952 | 0.2908 | - | ✗ Not accelerated |
-| └─ Build Merkle tree | 0.7756 | 0.6453 | **1.20x** | ✓ Tuned |
-| **Construct opening set** | 0.1609 | 0.1600 | - | ✗ Not accelerated |
-| **Compute opening proofs** | 1.3580 | 1.2919 | - | ✗ Not accelerated |
-| └─ Reduce 255 polynomials | 0.8715 | 0.8518 | - | ✗ Not accelerated |
-| └─ Reduce 2 polynomials | 0.0087 | 0.0085 | - | ✗ Not accelerated |
-| └─ Final FFT 4194304 | 0.3083 | 0.3023 | - | ✗ Not accelerated |
-| └─ Fold codewords | 0.1312 | 0.0904 | - | ✗ Not accelerated |
-| └─ Find PoW witness | 0.0014 | 0.0038 | - | ✗ Not accelerated |
\ No newline at end of file
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 5121535a7..7f91186a0 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -12,8 +12,8 @@ keywords.workspace = true
 categories.workspace = true
 
 [features]
-# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
+default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
 std = ["anyhow/std", "rand/std", "itertools/use_std"]
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index fade89256..71bd479bb 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -34,10 +34,10 @@ fn main() -> Result<()> {
     }
     println!("Circuit built.");
 
-    let size = 16;
-
     #[cfg(feature = "cuda")]
     {
+        let size = 16;
+
         zeknox::clear_cuda_errors_rs();
         println!("Initializing CUDA twiddle factors...");
 
diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs
index 892564932..4fb3393d3 100644
--- a/plonky2/src/hash/merkle_proofs.rs
+++ b/plonky2/src/hash/merkle_proofs.rs
@@ -240,6 +240,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
 
     /// Same as `verify_batch_merkle_proof_to_cap`, except with the final "cap index" as separate parameter,
     /// rather than being contained in `leaf_index_bits`.
+    #[allow(dead_code)]
     pub(crate) fn verify_batch_merkle_proof_to_cap_with_cap_index<H: AlgebraicHasher<F>>(
         &mut self,
         leaf_data: &[Vec<Target>],
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 30693ca6a..705f5ad74 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -41,6 +41,7 @@ fn print_time(now: Instant, msg: &str) {
 }
 
 #[cfg(not(feature = "cuda"))]
+#[allow(dead_code)]
 fn print_time(_now: Instant, _msg: &str) {}
 
 #[cfg(feature = "cuda")]
@@ -525,7 +526,6 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             }
         }
 
-
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index 64771ae3a..d05619311 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -244,7 +244,7 @@ where
     } else {
         zs_partial_products
     };
-    
+
     let partial_products_zs_and_lookup_commitment = timed!(
         timing,
         "commit to partial products, Z's and, if any, lookup polynomials",
@@ -305,7 +305,7 @@ where
             prover_data.fft_root_table.as_ref(),
         )
     );
-    
+
     challenger.observe_cap::<C::Hasher>(&quotient_polys_commitment.merkle_tree.cap);
 
     let zeta = challenger.get_extension_challenge::<D>();
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index 6f2ae608e..cb11f05e2 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -3,7 +3,6 @@
 #[cfg(not(feature = "std"))]
 use alloc::vec::Vec;
 
-use plonky2_maybe_rayon::*;
 #[doc(inline)]
 pub use plonky2_util::*;
 #[cfg(feature = "cuda")]
diff --git a/test_gpu.sh b/test_gpu.sh
new file mode 100755
index 000000000..a9c2c4a8c
--- /dev/null
+++ b/test_gpu.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+# GPU Testing Script for Plonky2
+# This script validates CUDA setup, zeknox library, and runs GPU-accelerated tests
+
+set -e  # Exit on any error
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+echo "========================================="
+echo "Plonky2 GPU Testing Script"
+echo "========================================="
+echo ""
+
+# Step 1: Check NVIDIA driver and CUDA
+echo -e "${YELLOW}[1/5] Checking NVIDIA driver and CUDA...${NC}"
+if ! command -v nvidia-smi &> /dev/null; then
+    echo -e "${RED}ERROR: nvidia-smi not found. Please install NVIDIA drivers.${NC}"
+    exit 1
+fi
+
+echo "NVIDIA Driver Information:"
+nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv
+echo ""
+
+# Check for CUDA toolkit
+if command -v nvcc &> /dev/null; then
+    echo "CUDA Compiler Version:"
+    nvcc --version | grep "release"
+    echo ""
+elif [ -n "$CUDA_HOME" ]; then
+    echo "CUDA_HOME is set to: $CUDA_HOME"
+    echo ""
+else
+    echo -e "${YELLOW}WARNING: nvcc not found and CUDA_HOME not set. CUDA toolkit may not be installed.${NC}"
+    echo "Continuing anyway as runtime libraries may still be available..."
+    echo ""
+fi
+
+echo -e "${GREEN} NVIDIA driver check passed${NC}"
+echo ""
+
+# Step 2: Check zeknox library
+echo -e "${YELLOW}[2/5] Checking zeknox library...${NC}"
+ZEKNOX_PATH="../zeknox"
+if [ ! -d "$ZEKNOX_PATH" ]; then
+    echo -e "${RED}ERROR: zeknox library not found at $ZEKNOX_PATH${NC}"
+    echo "Expected location: $(cd .. && pwd)/zeknox"
+    exit 1
+fi
+
+if [ -d "$ZEKNOX_PATH/wrappers/rust" ]; then
+    echo "Found zeknox library at: $(cd $ZEKNOX_PATH && pwd)"
+    echo "Rust wrappers directory exists: $ZEKNOX_PATH/wrappers/rust"
+else
+    echo -e "${YELLOW}WARNING: zeknox/wrappers/rust not found, but zeknox directory exists${NC}"
+fi
+
+echo -e "${GREEN} zeknox library check passed${NC}"
+echo ""
+
+# Step 3: Run field tests
+echo -e "${YELLOW}[3/5] Running field tests with GPU acceleration...${NC}"
+echo "Command: cd field && cargo test --release --features=cuda -- --test-threads=1"
+echo ""
+
+cd field
+if cargo test --release --features=cuda -- --test-threads=1; then
+    echo ""
+    echo -e "${GREEN} Field tests passed${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Field tests failed${NC}"
+    cd ..
+    exit 1
+fi
+cd ..
+echo ""
+
+# Step 4: Run fibonacci example with CUDA for correctness
+echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}"
+echo "Command: NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci"
+echo ""
+
+if NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci; then
+    echo ""
+    echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}"
+    exit 1
+fi
+echo ""
+
+# Step 5: Run fibonacci example with CUDA for speed
+echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}"
+echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda"
+echo ""
+
+if NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda; then
+    echo ""
+    echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}"
+    exit 1
+fi
+echo ""
+
+
+# Step 6: Run fibonacci example with CPU
+echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}"
+echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci"
+echo ""
+
+if cargo run --release --example fibonacci; then
+    echo ""
+    echo -e "${GREEN} Fibonacci example completed successfully with CPU${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Fibonacci example failed with CPU${NC}"
+    exit 1
+fi
+echo ""
+
+# Step 7: Summary
+echo "========================================="
+echo -e "${GREEN}All GPU tests completed successfully!${NC}"
+echo "========================================="
+echo ""
+echo "Tests run:"
+echo "   NVIDIA driver and CUDA verification"
+echo "   zeknox library verification"
+echo "   Field tests (FFT, polynomials, interpolation, cosets)"
+echo "   Fibonacci proof generation with GPU acceleration"
+echo ""
+echo -e "${GREEN}GPU testing complete!${NC}"

From d58854cb91d6226882f36515533afe52da450fbe Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 21:30:23 +0000
Subject: [PATCH 24/37] more clean up

---
 plonky2/src/plonk/verifier.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/plonky2/src/plonk/verifier.rs b/plonky2/src/plonk/verifier.rs
index d369656c6..fa1bc14b8 100644
--- a/plonky2/src/plonk/verifier.rs
+++ b/plonky2/src/plonk/verifier.rs
@@ -27,11 +27,6 @@ pub(crate) fn verify<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, c
         &verifier_data.circuit_digest,
         common_data,
     )?;
-    println!("verifier alphas: {:?}", challenges.plonk_alphas);
-    println!("verifier betas: {:?}", challenges.plonk_betas);
-    println!("verifier gammas: {:?}", challenges.plonk_gammas);
-    println!("verifier deltas: {:?}", challenges.plonk_deltas);
-    println!("verifier zeta: {:?}", challenges.plonk_zeta);
 
     verify_with_challenges::<F, C, D>(
         proof_with_pis.proof,

From 50a71fb6a087eb5f325220f95d6fb3c18b33df54 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Thu, 11 Dec 2025 21:36:37 +0000
Subject: [PATCH 25/37] more clean up

---
 plonky2/src/hash/merkle_tree.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 705f5ad74..41d0ffee7 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -36,8 +36,8 @@ use crate::util::log2_strict;
 pub static GPU_ID: Lazy<Arc<Mutex<u64>>> = Lazy::new(|| Arc::new(Mutex::new(0)));
 
 #[cfg(feature = "cuda")]
-fn print_time(now: Instant, msg: &str) {
-    println!("Time {} {} ms", msg, now.elapsed().as_millis());
+fn print_time(_now: Instant, _msg: &str) {
+    // println!("Time {} {} ms", _msg, _now.elapsed().as_millis());
 }
 
 #[cfg(not(feature = "cuda"))]
@@ -526,6 +526,7 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             }
         }
 
+
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.

From 9246859ffe2eec7c0fdbefa536c69e8951601e47 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Fri, 12 Dec 2025 20:25:59 +0000
Subject: [PATCH 26/37] fix bugs

---
 field/Cargo.toml                     |  3 ++-
 plonky2/Cargo.toml                   |  9 ++++++--
 plonky2/src/fri/oracle.rs            | 31 +++-------------------------
 plonky2/src/hash/merkle_tree.rs      |  1 -
 plonky2/src/plonk/circuit_builder.rs |  2 +-
 test_gpu.sh                          | 12 +++++------
 6 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/field/Cargo.toml b/field/Cargo.toml
index 02f535922..39ee8ef07 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -35,7 +35,8 @@ workspace = true
 
 
 [features]
-default = []
+# default = []
+default = [ "cuda" ]
 # default = [ "cuda", "cuda_sanity_check" ]
 cuda = []
 # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 7f91186a0..df60129ac 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -11,8 +11,13 @@ repository.workspace = true
 keywords.workspace = true
 categories.workspace = true
 
+
 [features]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+# default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ]
+
+default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ]
+# default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 # default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
@@ -31,7 +36,7 @@ itertools = { workspace = true }
 keccak-hash = { version = "0.8.0", default-features = false }
 log = { workspace = true }
 num = { workspace = true }
-once_cell = { workspace = true }
+once_cell = { workspace = true, features = ["std"] }
 rand = { workspace = true }
 rand_chacha = { version = "0.3.1", optional = true, default-features = false }
 serde = { workspace = true, features = ["rc"] }
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index d008301cf..e058374e2 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -55,8 +55,8 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
 {
     /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
     /// This function is called by the builder during preprocessing the circuit.
-    /// We use parallel IFFT on CPU here to avoid strange GPU issue.
-    pub fn preprocessor_from_values(
+    /// This function always calls IFFT on CPU to avoid strange GPU issue.
+    pub fn from_values(
         values: Vec<PolynomialValues<F>>,
         rate_bits: usize,
         blinding: bool,
@@ -66,7 +66,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     ) -> Self {
         let coeffs = timed!(
             timing,
-            "IFFT",
+            "CPU IFFT",
             values
                 .into_par_iter()
                 .map(|v| v.ifft_cpu())
@@ -83,31 +83,6 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         )
     }
 
-    /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
-    pub fn from_values(
-        values: Vec<PolynomialValues<F>>,
-        rate_bits: usize,
-        blinding: bool,
-        cap_height: usize,
-        timing: &mut TimingTree,
-        fft_root_table: Option<&FftRootTable<F>>,
-    ) -> Self {
-        let coeffs = timed!(
-            timing,
-            "IFFT",
-            values.into_par_iter().map(|v| v.ifft()).collect::<Vec<_>>()
-        );
-
-        Self::from_coeffs(
-            coeffs,
-            rate_bits,
-            blinding,
-            cap_height,
-            timing,
-            fft_root_table,
-        )
-    }
-
     /// Creates a list polynomial commitment for the polynomials `polynomials`.
     pub fn from_coeffs(
         polynomials: Vec<PolynomialCoeffs<F>>,
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 41d0ffee7..b2a57df52 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -526,7 +526,6 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             }
         }
 
-
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
             // `num_digests` and `len_cap`, resp.
diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs
index 5b2f2c3a1..e6a81f378 100644
--- a/plonky2/src/plonk/circuit_builder.rs
+++ b/plonky2/src/plonk/circuit_builder.rs
@@ -1229,7 +1229,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         // It does not impact performance as this is only done once during setup.
         let constants_sigmas_commitment = if commit_to_sigma {
             let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat();
-            PolynomialBatch::<F, C, D>::preprocessor_from_values(
+            PolynomialBatch::<F, C, D>::from_values(
                 constants_sigmas_vecs,
                 rate_bits,
                 PlonkOracle::CONSTANTS_SIGMAS.blinding,
diff --git a/test_gpu.sh b/test_gpu.sh
index a9c2c4a8c..9979bfeaf 100755
--- a/test_gpu.sh
+++ b/test_gpu.sh
@@ -17,7 +17,7 @@ echo "========================================="
 echo ""
 
 # Step 1: Check NVIDIA driver and CUDA
-echo -e "${YELLOW}[1/5] Checking NVIDIA driver and CUDA...${NC}"
+echo -e "${YELLOW}[1/7] Checking NVIDIA driver and CUDA...${NC}"
 if ! command -v nvidia-smi &> /dev/null; then
     echo -e "${RED}ERROR: nvidia-smi not found. Please install NVIDIA drivers.${NC}"
     exit 1
@@ -45,7 +45,7 @@ echo -e "${GREEN} NVIDIA driver check passed${NC}"
 echo ""
 
 # Step 2: Check zeknox library
-echo -e "${YELLOW}[2/5] Checking zeknox library...${NC}"
+echo -e "${YELLOW}[2/7] Checking zeknox library...${NC}"
 ZEKNOX_PATH="../zeknox"
 if [ ! -d "$ZEKNOX_PATH" ]; then
     echo -e "${RED}ERROR: zeknox library not found at $ZEKNOX_PATH${NC}"
@@ -64,7 +64,7 @@ echo -e "${GREEN} zeknox library check passed${NC}"
 echo ""
 
 # Step 3: Run field tests
-echo -e "${YELLOW}[3/5] Running field tests with GPU acceleration...${NC}"
+echo -e "${YELLOW}[3/7] Running field tests with GPU acceleration...${NC}"
 echo "Command: cd field && cargo test --release --features=cuda -- --test-threads=1"
 echo ""
 
@@ -82,7 +82,7 @@ cd ..
 echo ""
 
 # Step 4: Run fibonacci example with CUDA for correctness
-echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}"
+echo -e "${YELLOW}[4/7] Running fibonacci example with CUDA features...${NC}"
 echo "Command: NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci"
 echo ""
 
@@ -97,7 +97,7 @@ fi
 echo ""
 
 # Step 5: Run fibonacci example with CUDA for speed
-echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}"
+echo -e "${YELLOW}[5/7] Running fibonacci example with CUDA features...${NC}"
 echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda"
 echo ""
 
@@ -113,7 +113,7 @@ echo ""
 
 
 # Step 6: Run fibonacci example with CPU
-echo -e "${YELLOW}[4/5] Running fibonacci example with CUDA features...${NC}"
+echo -e "${YELLOW}[6/7] Running fibonacci example with CUDA features...${NC}"
 echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci"
 echo ""
 

From 39ca68006a2ef6da6b3b72bb6f29ff8a7317cfad Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Mon, 15 Dec 2025 21:01:10 +0000
Subject: [PATCH 27/37] working now

---
 field/src/fft.rs                    |  34 +++
 field/src/polynomial/division.rs    |   2 +-
 plonky2/examples/fibonacci.rs       |   4 +-
 plonky2/src/fri/oracle.rs           | 332 +++++++++++++++++++++++++++-
 plonky2/src/plonk/circuit_data.rs   |  12 +-
 plonky2/src/plonk/prover.rs         | 100 ++++++---
 plonky2/src/plonk/vanishing_poly.rs |  95 +++++---
 7 files changed, 505 insertions(+), 74 deletions(-)

diff --git a/field/src/fft.rs b/field/src/fft.rs
index 2509c7b2c..e08fd8021 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -11,6 +11,40 @@ use crate::types::Field;
 
 pub type FftRootTable<F> = Vec<Vec<F>>;
 
+/// Computes the FFT of every polynomial in `input`, preserving input order; an
+/// empty batch yields an empty vector. With the `cuda` feature the coefficients
+/// are flattened into one `zeknox::ntt_batch` call sized from `input[0]`, so all
+/// polynomials must share its length; otherwise `fft_with_options` runs per polynomial.
+pub fn batch_fft<F: Field>(input: &[PolynomialCoeffs<F>]) -> Vec<PolynomialValues<F>> {
+    if input.is_empty() {
+        return Vec::new();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        use zeknox::ntt_batch;
+        use zeknox::types::NTTConfig;
+        let poly_len = input[0].len();
+        let mut data = input
+            .iter()
+            .flat_map(|poly| poly.coeffs.clone())
+            .collect::<Vec<F>>();
+        let mut cfg = NTTConfig::default();
+        cfg.batches = input.len() as u32;
+        ntt_batch(0, &mut data, log2_strict(poly_len), cfg);
+
+        data.chunks(poly_len)
+            .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+            .collect()
+    }
+    #[cfg(not(feature = "cuda"))]
+    {
+        input
+            .iter()
+            .map(|poly| fft_with_options(poly.clone(), None, None))
+            .collect()
+    }
+}
+
 pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     let lg_n = log2_strict(n);
     // bases[i] = g^2^i, for i = 0, ..., lg_n - 1
diff --git a/field/src/polynomial/division.rs b/field/src/polynomial/division.rs
index 7d85d5492..a34602de4 100644
--- a/field/src/polynomial/division.rs
+++ b/field/src/polynomial/division.rs
@@ -78,7 +78,7 @@ impl<F: Field> PolynomialCoeffs<F> {
             .iter()
             .rev()
             .scan(F::ZERO, |acc, &c| {
-                *acc = *acc * z + c;
+                *acc = c.multiply_accumulate(*acc, z);
                 Some(*acc)
             })
             .collect::<Vec<_>>();
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index 71bd479bb..d2d30e2d7 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -27,7 +27,7 @@ fn main() -> Result<()> {
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..999999 {
+    for _ in 0..99 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
@@ -36,7 +36,7 @@ fn main() -> Result<()> {
 
     #[cfg(feature = "cuda")]
     {
-        let size = 16;
+        let size = 3;
 
         zeknox::clear_cuda_errors_rs();
         println!("Initializing CUDA twiddle factors...");
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index e058374e2..29ebb5fd1 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -53,6 +53,24 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> D
 impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     PolynomialBatch<F, C, D>
 {
+    // pub fn from_values_gpu(
+    //     values: Vec<PolynomialValues<F>>,
+    //     rate_bits: usize,
+    //     blinding: bool,
+    //     cap_height: usize,
+    //     timing: &mut TimingTree,
+    //     fft_root_table: Option<&FftRootTable<F>>,
+    // ) -> Self {
+    //     let coeffs = timed!(
+    //         timing,
+    //         "CPU IFFT",
+    //         values
+    //             .into_par_iter()
+    //             .map(|v| v.ifft_cpu())
+    //             .collect::<Vec<_>>()
+    //     );
+    // }
+
     /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
     /// This function is called by the builder during preprocessing the circuit.
     /// This function always calls IFFT on CPU to avoid strange GPU issue.
@@ -73,7 +91,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
                 .collect::<Vec<_>>()
         );
 
-        Self::from_coeffs(
+        Self::from_coeffs_gpu(
             coeffs,
             rate_bits,
             blinding,
@@ -116,6 +134,318 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         }
     }
 
+    pub fn from_coeffs_gpu(
+        polynomials: Vec<PolynomialCoeffs<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        fft_root_table: Option<&FftRootTable<F>>,
+    ) -> Self {
+        let degree = polynomials[0].len();
+
+        // If blinding, salt with two random elements to each leaf vector.
+        let salt_size = if blinding { SALT_SIZE } else { 0 };
+        println!(
+            "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}",
+            polynomials.len(),
+            degree,
+            blinding,
+            salt_size
+        );
+
+        #[cfg(feature = "cuda")]
+        {
+            if F::CUDA_SUPPORT {
+                return Self::from_coeffs_gpu_optimized(
+                    polynomials,
+                    rate_bits,
+                    blinding,
+                    cap_height,
+                    timing,
+                    fft_root_table,
+                    degree,
+                    salt_size,
+                );
+            }
+        }
+
+        // Fallback to CPU path
+        let lde_values = polynomials
+            .iter()
+            .map(|p| {
+                assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
+                p.lde(rate_bits)
+                    .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table)
+                    .values
+            })
+            .chain(
+                (0..salt_size)
+                    .into_iter()
+                    .map(|_| F::rand_vec(degree << rate_bits)),
+            )
+            .collect::<Vec<_>>();
+        let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values));
+        reverse_index_bits_in_place(&mut leaves);
+        let merkle_tree = timed!(
+            timing,
+            "build Merkle tree",
+            MerkleTree::new_from_2d(leaves, cap_height)
+        );
+
+        Self {
+            polynomials,
+            merkle_tree,
+            degree_log: log2_strict(degree),
+            rate_bits,
+            blinding,
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    fn from_coeffs_gpu_optimized(
+        polynomials: Vec<PolynomialCoeffs<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        fft_root_table: Option<&FftRootTable<F>>,
+        degree: usize,
+        salt_size: usize,
+    ) -> Self {
+        println!("Using GPU-accelerated LDE computation");
+
+        use std::time::Instant;
+
+        use zeknox::device::memory::HostOrDeviceSlice;
+        use zeknox::types::{NTTConfig, TransposeConfig};
+        use zeknox::{ntt_batch_ptr, transpose_rev_batch};
+
+        let lde_size = degree << rate_bits;
+        let num_polys = polynomials.len() + salt_size;
+
+        // let lde_cpu = {
+        //     // Fallback to CPU path
+        //     let lde_values = polynomials
+        //         .iter()
+        //         .map(|p| {
+        //             assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
+        //             p.lde(rate_bits)
+        //                 .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table)
+        //                 .values
+        //         })
+        //         .collect::<Vec<_>>();
+
+        //     // for v in &lde_values {
+        //     //     println!("lde_values {:?}", v);
+        //     // }
+
+        //     lde_values
+        // };
+
+        // let salt_size = if blinding { SALT_SIZE } else { 0 };
+
+        // Step 1: Compute coset FFT on GPU, keeping data on GPU
+        let gpu_lde_values = timed!(timing, "GPU coset FFT", {
+            // let mut all_lde_data = Vec::with_capacity(num_polys);
+
+            // // Process each polynomial
+            // for p in polynomials.iter() {
+            //     assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
+
+            //     // Perform LDE (padding) and coset scaling on CPU
+            //     let mut padded_coeffs = p.lde(rate_bits).coeffs;
+
+            //     // Apply coset shift
+            //     let shift = F::coset_shift();
+            //     for (i, coeff) in padded_coeffs.iter_mut().enumerate() {
+            //         *coeff *= shift.exp_u64(i as u64);
+            //     }
+
+            //     all_lde_data.push(padded_coeffs);
+            // }
+
+            // // Add salt
+            // for _ in 0..salt_size {
+            //     all_lde_data.push(F::rand_vec(lde_size));
+            // }
+
+            // Allocate GPU memory for all polynomials
+            let total_elements = num_polys * lde_size;
+            println!(
+                "Allocating GPU memory for {} polynomials of size {} (total {} elements)",
+                num_polys, lde_size, total_elements
+            );
+            let total_alloce_size = num_polys * lde_size;
+            // let total_alloce_size = num_polys.next_power_of_two() * lde_size;
+
+            let timer = Instant::now();
+            let mut gpu_buffer = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size)
+                .expect("Failed to allocate GPU memory");
+            println!("cuda alloc took: {:?}", timer.elapsed());
+            println!(
+                "lde size: {}, total_elements: {}, total allocated size: {}",
+                lde_size, total_elements, total_alloce_size
+            );
+
+            let timer = Instant::now();
+            // Copy all data to GPU in one go
+            let mut flat_data = vec![F::ZERO; total_alloce_size];
+
+            for i in 0..polynomials.len() {
+
+                flat_data[i*lde_size.. i*lde_size +degree].copy_from_slice(polynomials[i].coeffs.as_ref())
+
+            }
+
+            // polynomials.par_iter().zip(flat_data.par_chunks_exact_mut(lde_size)).for_each(|(p, c)|
+            //     c[..degree].copy_from_slice(p.coeffs.as_ref())
+            // );
+
+            // let flat_data: Vec<F> = polynomials
+            //     .iter()
+            //     .flat_map(|v| v.lde(rate_bits).coeffs) // pad each polynomial to lde_size
+            //     // .into_iter()
+            //     // .collect()
+            //     // .iter()
+            //     // .chain(
+            //     //     vec![F::ZERO; total_alloce_size - total_elements]
+            //     //         .iter()
+            //     //         .copied(),
+            //     // )
+            //     .collect();
+
+            println!("cpu prepare took: {:?}", timer.elapsed());
+            // let flat_data =  vec![F::ZERO; lde_size - total_elements].iter()
+
+            println!(
+                "Copying {} elements to GPU (expected {})",
+                flat_data.len(),
+                total_elements
+            );
+
+            let timer = Instant::now();
+            gpu_buffer
+                .copy_from_host(&flat_data)
+                .expect("Failed to copy data to GPU");
+            println!("IO took: {:?}", timer.elapsed());
+
+
+            // Perform batched NTT on GPU
+            let log_domain_size = log2_strict(lde_size);
+            let ntt_config = NTTConfig {
+                batches: num_polys as u32,
+                are_inputs_on_device: true,
+                are_outputs_on_device: true,
+                with_coset: true,
+                ..Default::default()
+            };
+
+            let timer = Instant::now();
+            ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config);
+
+            println!("comput took: {:?}", timer.elapsed());
+
+            gpu_buffer
+        });
+
+        println!("Completed GPU coset FFT for {} polynomials", num_polys);
+
+        // let total_elements = num_polys * lde_size;
+        // let mut gpu_lde_values_copied_to_cpu = vec![F::ZERO; total_elements];
+
+        // gpu_lde_values
+        //     .copy_to_host(&mut gpu_lde_values_copied_to_cpu, total_elements)
+        //     .expect("Failed to copy data from GPU");
+
+        // println!("lde value: {:?}", gpu_lde_values_copied_to_cpu);
+
+        // let lde_cpu_1d = lde_cpu.clone().into_iter().flatten().collect::<Vec<_>>();
+        // assert_eq!(lde_cpu_1d.len(), gpu_lde_values_copied_to_cpu.len(), "LDE size mismatch");
+        // assert_eq!(lde_cpu_1d, gpu_lde_values_copied_to_cpu, "LDE values mismatch");
+
+        // Step 2: Transpose on GPU using Zeknox
+        let gpu_transposed = timed!(timing, "GPU transpose", {
+            let total_alloce_size = num_polys * lde_size;
+            // let total_alloce_size = num_polys.next_power_of_two() * lde_size;
+
+            let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size)
+                .expect("Failed to allocate GPU memory for transpose");
+
+            let log_n = log2_strict(lde_size);
+            let transpose_config = TransposeConfig {
+                batches: num_polys as u32,
+                are_inputs_on_device: true,
+                are_outputs_on_device: true,
+            };
+
+            transpose_rev_batch(
+                0,
+                gpu_output.as_mut_ptr(),
+                gpu_lde_values.as_ptr(),
+                log_n,
+                transpose_config,
+            );
+
+            // gpu_lde_values will be automatically freed when it goes out of scope
+
+            gpu_output
+        });
+        print!("Completed GPU transpose for {} polynomials", num_polys);
+
+        // Step 3: Copy back to CPU
+        let leaves = timed!(timing, "GPU to CPU transfer", {
+            let total_elements = num_polys * lde_size;
+            let mut cpu_data = vec![F::ZERO; total_elements];
+
+            gpu_transposed
+                .copy_to_host(&mut cpu_data, total_elements)
+                .expect("Failed to copy data from GPU");
+
+            // // Reshape into leaves: Vec<Vec<F>> where each inner vec has num_polys elements
+            // cpu_data
+            //     .chunks(num_polys)
+            //     .map(|chunk| chunk.to_vec())
+            //     .collect::<Vec<_>>()
+
+            cpu_data
+        });
+
+        // let mut leaves_cpu = timed!(timing, "transpose LDEs", transpose(&lde_cpu));
+        // println!("tatal leaves: {}", leaves.len());
+        // println!("leaves[0]:     {:?}", leaves[0]);
+        // println!("leaves_cpu[0]: {:?}", leaves_cpu[0]);
+
+        // reverse_index_bits_in_place(&mut leaves_cpu);
+        // for i in 0..leaves.len() {
+        //     if leaves[i] != leaves_cpu[i] {
+        //         println!("Mismatch at leaf {}: \n{:?}\n{:?}\n", i, leaves[i], leaves_cpu[i]);
+        //     }
+        // }
+
+        // assert!(leaves == leaves_cpu, "Transposed LDE values mismatch");
+
+        // let merkle_tree = timed!(
+        //     timing,
+        //     "build Merkle tree",
+        //     MerkleTree::new_from_2d(leaves, cap_height)
+        // );
+
+       let merkle_tree = timed!(
+            timing,
+            "build Merkle tree",
+            MerkleTree::new_from_1d(leaves,polynomials.len(), cap_height)
+        );
+
+        Self {
+            polynomials,
+            merkle_tree,
+            degree_log: log2_strict(degree),
+            rate_bits,
+            blinding,
+        }
+    }
+
     pub(crate) fn lde_values(
         polynomials: &[PolynomialCoeffs<F>],
         rate_bits: usize,
diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs
index e06e9b69b..61aa36ab0 100644
--- a/plonky2/src/plonk/circuit_data.rs
+++ b/plonky2/src/plonk/circuit_data.rs
@@ -19,6 +19,7 @@ use core::ops::{Range, RangeFrom};
 use std::collections::BTreeMap;
 
 use anyhow::Result;
+use log::Level;
 use serde::Serialize;
 
 use super::circuit_builder::LookupWire;
@@ -213,12 +214,11 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     }
 
     pub fn prove(&self, inputs: PartialWitness<F>) -> Result<ProofWithPublicInputs<F, C, D>> {
-        prove::<F, C, D>(
-            &self.prover_only,
-            &self.common,
-            inputs,
-            &mut TimingTree::default(),
-        )
+        let mut timing = TimingTree::new("CircuitData::prove", Level::Debug);
+
+        let res = prove::<F, C, D>(&self.prover_only, &self.common, inputs, &mut timing);
+        timing.print();
+        res
     }
 
     pub fn verify(&self, proof_with_pis: ProofWithPublicInputs<F, C, D>) -> Result<()> {
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index d05619311..b79391fc8 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -4,6 +4,7 @@
 use alloc::{format, vec, vec::Vec};
 use core::cmp::min;
 use core::mem::swap;
+use std::time::Instant;
 
 use anyhow::{ensure, Result};
 use hashbrown::HashMap;
@@ -649,11 +650,15 @@ fn compute_quotient_polys<
     // steps away since we work on an LDE of degree `max_filtered_constraint_degree`.
     let next_step = 1 << quotient_degree_bits;
 
+    let timer = Instant::now();
     let points = F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits);
+    println!("Time to compute LDE points: {:?}", timer.elapsed());
+
     let lde_size = points.len();
 
     let z_h_on_coset = ZeroPolyOnCoset::new(common_data.degree_bits(), quotient_degree_bits);
 
+    let timer = Instant::now();
     // Precompute the lookup table evals on the challenges in delta
     // These values are used to produce the final RE constraints for each lut,
     // and are the same each time in check_lookup_constraints_batched.
@@ -686,14 +691,19 @@ fn compute_quotient_polys<
     } else {
         vec![]
     };
+    println!(
+        "Time to compute LUT RE polynomial evals: {:?}",
+        timer.elapsed()
+    );
 
+    let timer = Instant::now();
     let lut_re_poly_evals_refs: Vec<&[F]> =
         lut_re_poly_evals.iter().map(|v| v.as_slice()).collect();
 
-    let points_batches = points.par_chunks(BATCH_SIZE);
     let num_batches = points.len().div_ceil(BATCH_SIZE);
 
-    let quotient_values: Vec<Vec<F>> = points_batches
+    let quotient_values: Vec<Vec<F>> = points
+        .par_chunks(BATCH_SIZE)
         .enumerate()
         .flat_map(|(batch_i, xs_batch)| {
             // Each batch must be the same size, except the last one, which may be smaller.
@@ -702,23 +712,26 @@ fn compute_quotient_polys<
                     || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE)
             );
 
-            let indices_batch: Vec<usize> =
-                (BATCH_SIZE * batch_i..BATCH_SIZE * batch_i + xs_batch.len()).collect();
+            let batch_size = xs_batch.len();
+            let batch_start = BATCH_SIZE * batch_i;
+
+            let mut shifted_xs_batch = Vec::with_capacity(batch_size);
+            let mut local_zs_batch = Vec::with_capacity(batch_size);
+            let mut next_zs_batch = Vec::with_capacity(batch_size);
 
-            let mut shifted_xs_batch = Vec::with_capacity(xs_batch.len());
-            let mut local_zs_batch = Vec::with_capacity(xs_batch.len());
-            let mut next_zs_batch = Vec::with_capacity(xs_batch.len());
+            let mut local_lookup_batch = Vec::with_capacity(batch_size);
+            let mut next_lookup_batch = Vec::with_capacity(batch_size);
 
-            let mut local_lookup_batch = Vec::with_capacity(xs_batch.len());
-            let mut next_lookup_batch = Vec::with_capacity(xs_batch.len());
+            let mut partial_products_batch = Vec::with_capacity(batch_size);
+            let mut s_sigmas_batch = Vec::with_capacity(batch_size);
 
-            let mut partial_products_batch = Vec::with_capacity(xs_batch.len());
-            let mut s_sigmas_batch = Vec::with_capacity(xs_batch.len());
+            let mut local_constants_batch_refs = Vec::with_capacity(batch_size);
+            let mut local_wires_batch_refs = Vec::with_capacity(batch_size);
 
-            let mut local_constants_batch_refs = Vec::with_capacity(xs_batch.len());
-            let mut local_wires_batch_refs = Vec::with_capacity(xs_batch.len());
+            // let timer1 = Instant::now();
 
-            for (&i, &x) in indices_batch.iter().zip(xs_batch) {
+            for (j, &x) in xs_batch.iter().enumerate() {
+                let i = batch_start + j;
                 let shifted_x = F::coset_shift() * x;
                 let i_next = (i + next_step) % lde_size;
                 let local_constants_sigmas = prover_data
@@ -762,20 +775,28 @@ fn compute_quotient_polys<
                 s_sigmas_batch.push(s_sigmas);
             }
 
-            // NB (JN): I'm not sure how (in)efficient the below is. It needs measuring.
-            let mut local_constants_batch =
-                vec![F::ZERO; xs_batch.len() * local_constants_batch_refs[0].len()];
-            for i in 0..local_constants_batch_refs[0].len() {
+            // println!(
+            //     "Time to gather LDE values for batch {}: {:?}",
+            //     batch_i,
+            //     timer1.elapsed()
+            // );
+
+            // Optimized transposition with better cache locality
+            let n_constants = local_constants_batch_refs[0].len();
+            let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants];
+            for i in 0..n_constants {
+                let offset = i * xs_batch.len();
                 for (j, constants) in local_constants_batch_refs.iter().enumerate() {
-                    local_constants_batch[i * xs_batch.len() + j] = constants[i];
+                    local_constants_batch[offset + j] = constants[i];
                 }
             }
 
-            let mut local_wires_batch =
-                vec![F::ZERO; xs_batch.len() * local_wires_batch_refs[0].len()];
-            for i in 0..local_wires_batch_refs[0].len() {
+            let n_wires = local_wires_batch_refs[0].len();
+            let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires];
+            for i in 0..n_wires {
+                let offset = i * xs_batch.len();
                 for (j, wires) in local_wires_batch_refs.iter().enumerate() {
-                    local_wires_batch[i * xs_batch.len() + j] = wires[i];
+                    local_wires_batch[offset + j] = wires[i];
                 }
             }
 
@@ -786,6 +807,8 @@ fn compute_quotient_polys<
                 public_inputs_hash,
             );
 
+            // let timer1 = Instant::now();
+            let indices_batch: Vec<usize> = (batch_start..batch_start + batch_size).collect();
             let mut quotient_values_batch = eval_vanishing_poly_base_batch::<F, D>(
                 common_data,
                 &indices_batch,
@@ -804,21 +827,42 @@ fn compute_quotient_polys<
                 &z_h_on_coset,
                 &lut_re_poly_evals_refs,
             );
-
-            for (&i, quotient_values) in indices_batch.iter().zip(quotient_values_batch.iter_mut())
-            {
+            // println!(
+            //     "Time to eval vanishing poly for batch {}: {:?}",
+            //     batch_i,
+            //     timer1.elapsed()
+            // );
+
+            // let timer1 = Instant::now();
+            for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() {
+                let i = batch_start + j;
                 let denominator_inv = z_h_on_coset.eval_inverse(i);
                 quotient_values
                     .iter_mut()
                     .for_each(|v| *v *= denominator_inv);
             }
+            // println!(
+            //     "Time to divide out Z_H for batch {}: {:?}",
+            //     batch_i,
+            //     timer1.elapsed()
+            // );
+
             quotient_values_batch
         })
         .collect();
 
-    transpose(&quotient_values)
+    println!(
+        "Time to compute quotient polys: {:?} for {} points",
+        timer.elapsed(),
+        quotient_values.len()
+    );
+
+    let timer = Instant::now();
+    let res = transpose(&quotient_values)
         .into_par_iter()
         .map(PolynomialValues::new)
         .map(|values| values.coset_ifft(F::coset_shift()))
-        .collect()
+        .collect();
+    println!("Time to compute quotient polys IFFT: {:?}", timer.elapsed());
+    res
 }
diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs
index 48179ce63..0f6177daa 100644
--- a/plonky2/src/plonk/vanishing_poly.rs
+++ b/plonky2/src/plonk/vanishing_poly.rs
@@ -211,20 +211,29 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
     let num_challenges = common_data.config.num_challenges;
     let num_routed_wires = common_data.config.num_routed_wires;
 
-    let mut numerator_values = Vec::with_capacity(num_routed_wires);
-    let mut denominator_values = Vec::with_capacity(num_routed_wires);
+    // Pre-allocate reusable buffers with exact capacities
+    let mut numerator_values = Vec::with_capacity(num_routed_wires * num_challenges);
+    let mut denominator_values = Vec::with_capacity(num_routed_wires * num_challenges);
 
     // The L_0(x) (Z(x) - 1) vanishing terms.
     let mut vanishing_z_1_terms = Vec::with_capacity(num_challenges);
     // The terms checking the partial products.
-    let mut vanishing_partial_products_terms = Vec::new();
+    let mut vanishing_partial_products_terms = Vec::with_capacity(num_challenges * num_prods);
 
     // The terms checking the lookup constraints.
-    let mut vanishing_all_lookup_terms = if has_lookup {
+    let lookup_terms_capacity = if has_lookup {
         let num_sldc_polys = common_data.num_lookup_polys - 1;
-        Vec::with_capacity(
-            common_data.config.num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys),
-        )
+        num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys)
+    } else {
+        0
+    };
+    let mut vanishing_all_lookup_terms = Vec::with_capacity(lookup_terms_capacity);
+
+    // Pre-allocate selector buffer if needed
+    let selector_offset = common_data.selectors_info.num_selectors();
+    let num_lookup_selectors = common_data.num_lookup_selectors;
+    let mut lookup_selectors = if has_lookup && num_lookup_selectors > 0 {
+        Vec::with_capacity(num_lookup_selectors)
     } else {
         Vec::new()
     };
@@ -235,22 +244,20 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
         let x = xs_batch[k];
         let vars = vars_batch.view(k);
 
-        let lookup_selectors: Vec<F> = (0..common_data.num_lookup_selectors)
-            .map(|i| vars.local_constants[common_data.selectors_info.num_selectors() + i])
-            .collect();
+        // Reuse lookup_selectors buffer
+        if has_lookup {
+            lookup_selectors.clear();
+            lookup_selectors.extend(
+                (0..num_lookup_selectors).map(|i| vars.local_constants[selector_offset + i]),
+            );
+        }
 
         let local_zs = local_zs_batch[k];
         let next_zs = next_zs_batch[k];
-        let local_lookup_zs = if has_lookup {
-            local_lookup_zs_batch[k]
-        } else {
-            &[]
-        };
-
-        let next_lookup_zs = if has_lookup {
-            next_lookup_zs_batch[k]
+        let (local_lookup_zs, next_lookup_zs) = if has_lookup {
+            (local_lookup_zs_batch[k], next_lookup_zs_batch[k])
         } else {
-            &[]
+            (&[][..], &[][..])
         };
 
         let partial_products = partial_products_batch[k];
@@ -259,6 +266,16 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
         let constraint_terms = PackedStridedView::new(&constraint_terms_batch, n, k);
 
         let l_0_x = z_h_on_coset.eval_l_0(index, x);
+
+        // Pre-compute common values for all challenges
+        let beta_x_s_ids: Vec<F> = if num_challenges > 0 {
+            (0..num_routed_wires)
+                .map(|j| common_data.k_is[j] * x)
+                .collect()
+        } else {
+            Vec::new()
+        };
+
         for i in 0..num_challenges {
             let z_x = local_zs[i];
             let z_gx = next_zs[i];
@@ -268,10 +285,10 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
             if has_lookup {
                 let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)];
 
-                let cur_local_lookup_zs = &local_lookup_zs
-                    [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)];
-                let cur_next_lookup_zs = &next_lookup_zs
-                    [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)];
+                let lookup_poly_start = common_data.num_lookup_polys * i;
+                let lookup_poly_end = lookup_poly_start + common_data.num_lookup_polys;
+                let cur_local_lookup_zs = &local_lookup_zs[lookup_poly_start..lookup_poly_end];
+                let cur_next_lookup_zs = &next_lookup_zs[lookup_poly_start..lookup_poly_end];
 
                 let lookup_constraints = check_lookup_constraints_batch(
                     common_data,
@@ -285,17 +302,17 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
                 vanishing_all_lookup_terms.extend(lookup_constraints);
             }
 
-            numerator_values.extend((0..num_routed_wires).map(|j| {
-                let wire_value = vars.local_wires[j];
-                let k_i = common_data.k_is[j];
-                let s_id = k_i * x;
-                wire_value + betas[i] * s_id + gammas[i]
-            }));
-            denominator_values.extend((0..num_routed_wires).map(|j| {
+            let beta_i = betas[i];
+            let gamma_i = gammas[i];
+
+            // Compute numerators and denominators in a single pass
+            for j in 0..num_routed_wires {
                 let wire_value = vars.local_wires[j];
-                let s_sigma = s_sigmas[j];
-                wire_value + betas[i] * s_sigma + gammas[i]
-            }));
+                numerator_values
+                    .push(wire_value + gamma_i.multiply_accumulate(beta_i, beta_x_s_ids[j]));
+                denominator_values
+                    .push(wire_value + gamma_i.multiply_accumulate(beta_i, s_sigmas[j]));
+            }
 
             // The partial products considered for this iteration of `i`.
             let current_partial_products = &partial_products[i * num_prods..(i + 1) * num_prods];
@@ -587,7 +604,9 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
     // Check RE row transition constraint.
     let mut cur_sum = next_z_re;
     for elt in &current_lookup_combos {
-        cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt;
+        // cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt;
+        cur_sum =
+            elt.multiply_accumulate(cur_sum, deltas[LookupChallenges::ChallengeDelta as usize]);
     }
     let unfiltered_re_line = z_re - cur_sum;
 
@@ -639,7 +658,10 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
         let lut_sum_prods_with_mul = (poly * lut_degree
             ..min((poly + 1) * lut_degree, num_lut_slots))
             .fold(F::ZERO, |acc, i| {
-                acc + vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)] * lut_prod_i(i)
+                acc.multiply_accumulate(
+                    vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)],
+                    lut_prod_i(i),
+                )
             });
 
         // The previous element is the previous poly of the current row or the last poly of the next row.
@@ -656,7 +678,8 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
             .push(lookup_selectors[LookupSelectors::TransSre as usize] * unfiltered_sum_transition);
 
         // Check LDC row and col transitions. It's the same constraint, with a row transition happening for slot == 0.
-        let unfiltered_ldc_transition = lu_prod * (z_x_lookup_sldcs[poly] - prev) + lu_sum_prods;
+        let unfiltered_ldc_transition =
+            lu_sum_prods.multiply_accumulate(lu_prod, (z_x_lookup_sldcs[poly] - prev));
         constraints
             .push(lookup_selectors[LookupSelectors::TransLdc as usize] * unfiltered_ldc_transition);
     }

From d9375789c72fc85726b3813df13c01cf2816cfab Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Mon, 15 Dec 2025 22:04:55 +0000
Subject: [PATCH 28/37] clean up

---
 plonky2/src/fri/oracle.rs           | 306 +++++++++----------------
 plonky2/src/plonk/prover.rs         | 332 +++++++++++++---------------
 plonky2/src/plonk/vanishing_poly.rs |   2 +-
 plonky2/src/util/mod.rs             | 102 ---------
 4 files changed, 265 insertions(+), 477 deletions(-)

diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index 29ebb5fd1..56bb34c53 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -53,24 +53,6 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> D
 impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     PolynomialBatch<F, C, D>
 {
-    // pub fn from_values_gpu(
-    //     values: Vec<PolynomialValues<F>>,
-    //     rate_bits: usize,
-    //     blinding: bool,
-    //     cap_height: usize,
-    //     timing: &mut TimingTree,
-    //     fft_root_table: Option<&FftRootTable<F>>,
-    // ) -> Self {
-    //     let coeffs = timed!(
-    //         timing,
-    //         "CPU IFFT",
-    //         values
-    //             .into_par_iter()
-    //             .map(|v| v.ifft_cpu())
-    //             .collect::<Vec<_>>()
-    //     );
-    // }
-
     /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
     /// This function is called by the builder during preprocessing the circuit.
     /// This function always calls IFFT on CPU to avoid strange GPU issue.
@@ -91,14 +73,25 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
                 .collect::<Vec<_>>()
         );
 
-        Self::from_coeffs_gpu(
-            coeffs,
-            rate_bits,
-            blinding,
-            cap_height,
-            timing,
-            fft_root_table,
-        )
+        if cfg!(feature = "cuda") {
+            Self::from_coeffs_gpu(
+                coeffs,
+                rate_bits,
+                blinding,
+                cap_height,
+                timing,
+                fft_root_table,
+            )
+        } else {
+            Self::from_coeffs_cpu(
+                coeffs,
+                rate_bits,
+                blinding,
+                cap_height,
+                timing,
+                fft_root_table,
+            )
+        }
     }
 
     /// Creates a list polynomial commitment for the polynomials `polynomials`.
@@ -109,6 +102,36 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         cap_height: usize,
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
+    ) -> Self {
+        if cfg!(feature = "cuda") {
+            Self::from_coeffs_gpu(
+                polynomials,
+                rate_bits,
+                blinding,
+                cap_height,
+                timing,
+                fft_root_table,
+            )
+        } else {
+            Self::from_coeffs_cpu(
+                polynomials,
+                rate_bits,
+                blinding,
+                cap_height,
+                timing,
+                fft_root_table,
+            )
+        }
+    }
+
+    /// Creates a list polynomial commitment for `polynomials`; used when the `cuda` feature is off.
+    fn from_coeffs_cpu(
+        polynomials: Vec<PolynomialCoeffs<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
         let degree = polynomials[0].len();
         let lde_values = timed!(
@@ -134,7 +157,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         }
     }
 
-    pub fn from_coeffs_gpu(
+    fn from_coeffs_gpu(
         polynomials: Vec<PolynomialCoeffs<F>>,
         rate_bits: usize,
         blinding: bool,
@@ -154,20 +177,17 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
             salt_size
         );
 
-        #[cfg(feature = "cuda")]
-        {
-            if F::CUDA_SUPPORT {
-                return Self::from_coeffs_gpu_optimized(
-                    polynomials,
-                    rate_bits,
-                    blinding,
-                    cap_height,
-                    timing,
-                    fft_root_table,
-                    degree,
-                    salt_size,
-                );
-            }
+        if F::CUDA_SUPPORT {
+            return Self::from_coeffs_gpu_optimized(
+                polynomials,
+                rate_bits,
+                blinding,
+                cap_height,
+                timing,
+                fft_root_table,
+                degree,
+                salt_size,
+            );
         }
 
         // Fallback to CPU path
@@ -209,129 +229,64 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         blinding: bool,
         cap_height: usize,
         timing: &mut TimingTree,
-        fft_root_table: Option<&FftRootTable<F>>,
+        _fft_root_table: Option<&FftRootTable<F>>,
         degree: usize,
         salt_size: usize,
     ) -> Self {
-        println!("Using GPU-accelerated LDE computation");
-
-        use std::time::Instant;
-
         use zeknox::device::memory::HostOrDeviceSlice;
         use zeknox::types::{NTTConfig, TransposeConfig};
         use zeknox::{ntt_batch_ptr, transpose_rev_batch};
 
         let lde_size = degree << rate_bits;
         let num_polys = polynomials.len() + salt_size;
+        let total_alloc_size = num_polys * lde_size;
 
-        // let lde_cpu = {
-        //     // Fallback to CPU path
-        //     let lde_values = polynomials
-        //         .iter()
-        //         .map(|p| {
-        //             assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
-        //             p.lde(rate_bits)
-        //                 .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table)
-        //                 .values
-        //         })
-        //         .collect::<Vec<_>>();
-
-        //     // for v in &lde_values {
-        //     //     println!("lde_values {:?}", v);
-        //     // }
-
-        //     lde_values
-        // };
-
-        // let salt_size = if blinding { SALT_SIZE } else { 0 };
+        let salt_polys = (0..salt_size)
+            .map(|_| F::rand_vec(lde_size))
+            .collect::<Vec<_>>();
 
         // Step 1: Compute coset FFT on GPU, keeping data on GPU
         let gpu_lde_values = timed!(timing, "GPU coset FFT", {
-            // let mut all_lde_data = Vec::with_capacity(num_polys);
-
-            // // Process each polynomial
-            // for p in polynomials.iter() {
-            //     assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
-
-            //     // Perform LDE (padding) and coset scaling on CPU
-            //     let mut padded_coeffs = p.lde(rate_bits).coeffs;
-
-            //     // Apply coset shift
-            //     let shift = F::coset_shift();
-            //     for (i, coeff) in padded_coeffs.iter_mut().enumerate() {
-            //         *coeff *= shift.exp_u64(i as u64);
-            //     }
-
-            //     all_lde_data.push(padded_coeffs);
-            // }
-
-            // // Add salt
-            // for _ in 0..salt_size {
-            //     all_lde_data.push(F::rand_vec(lde_size));
-            // }
-
             // Allocate GPU memory for all polynomials
-            let total_elements = num_polys * lde_size;
             println!(
                 "Allocating GPU memory for {} polynomials of size {} (total {} elements)",
-                num_polys, lde_size, total_elements
+                num_polys, lde_size, total_alloc_size
             );
-            let total_alloce_size = num_polys * lde_size;
-            // let total_alloce_size = num_polys.next_power_of_two() * lde_size;
 
-            let timer = Instant::now();
-            let mut gpu_buffer = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size)
-                .expect("Failed to allocate GPU memory");
-            println!("cuda alloc took: {:?}", timer.elapsed());
-            println!(
-                "lde size: {}, total_elements: {}, total allocated size: {}",
-                lde_size, total_elements, total_alloce_size
+            let mut gpu_buffer = timed!(
+                timing,
+                format!("cuda alloc memory for {} elements", total_alloc_size).as_ref(),
+                HostOrDeviceSlice::cuda_malloc(0, total_alloc_size)
+                    .expect("Failed to allocate GPU memory")
             );
 
-            let timer = Instant::now();
             // Copy all data to GPU in one go
-            let mut flat_data = vec![F::ZERO; total_alloce_size];
-
-            for i in 0..polynomials.len() {
-
-                flat_data[i*lde_size.. i*lde_size +degree].copy_from_slice(polynomials[i].coeffs.as_ref())
-
-            }
 
-            // polynomials.par_iter().zip(flat_data.par_chunks_exact_mut(lde_size)).for_each(|(p, c)|
-            //     c[..degree].copy_from_slice(p.coeffs.as_ref())
-            // );
-
-            // let flat_data: Vec<F> = polynomials
-            //     .iter()
-            //     .flat_map(|v| v.lde(rate_bits).coeffs) // pad each polynomial to lde_size
-            //     // .into_iter()
-            //     // .collect()
-            //     // .iter()
-            //     // .chain(
-            //     //     vec![F::ZERO; total_alloce_size - total_elements]
-            //     //         .iter()
-            //     //         .copied(),
-            //     // )
-            //     .collect();
-
-            println!("cpu prepare took: {:?}", timer.elapsed());
-            // let flat_data =  vec![F::ZERO; lde_size - total_elements].iter()
+            let mut flat_data = vec![F::ZERO; total_alloc_size];
+
+            timed!(timing, "Prepare CPU memory", {
+                for i in 0..polynomials.len() {
+                    flat_data[i * lde_size..i * lde_size + degree]
+                        .copy_from_slice(polynomials[i].coeffs.as_ref())
+                }
+                for i in polynomials.len()..num_polys {
+                    flat_data[i * lde_size..(i + 1) * lde_size]
+                        .copy_from_slice(salt_polys[i - polynomials.len()].as_slice());
+                }
+            });
 
-            println!(
-                "Copying {} elements to GPU (expected {})",
-                flat_data.len(),
-                total_elements
+            timed!(
+                timing,
+                "CPU to GPU",
+                gpu_buffer
+                    .copy_from_host(&flat_data)
+                    .expect("Failed to copy data to GPU")
             );
 
-            let timer = Instant::now();
-            gpu_buffer
-                .copy_from_host(&flat_data)
-                .expect("Failed to copy data to GPU");
-            println!("IO took: {:?}", timer.elapsed());
-
-
             // Perform batched NTT on GPU
+            // Strictly speaking, we do not need to run FFTs over the salt polynomials,
+            // but leaving them out would make the CUDA memory extremely difficult to handle,
+            // so we run the FFTs on them as well.
             let log_domain_size = log2_strict(lde_size);
             let ntt_config = NTTConfig {
                 batches: num_polys as u32,
@@ -340,36 +295,21 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
                 with_coset: true,
                 ..Default::default()
             };
-
-            let timer = Instant::now();
-            ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config);
-
-            println!("comput took: {:?}", timer.elapsed());
-
+            timed!(
+                timing,
+                format!(
+                    "GPU batch NTT for {} poly of degree {}",
+                    num_polys, lde_size
+                )
+                .as_ref(),
+                ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config)
+            );
             gpu_buffer
         });
 
-        println!("Completed GPU coset FFT for {} polynomials", num_polys);
-
-        // let total_elements = num_polys * lde_size;
-        // let mut gpu_lde_values_copied_to_cpu = vec![F::ZERO; total_elements];
-
-        // gpu_lde_values
-        //     .copy_to_host(&mut gpu_lde_values_copied_to_cpu, total_elements)
-        //     .expect("Failed to copy data from GPU");
-
-        // println!("lde value: {:?}", gpu_lde_values_copied_to_cpu);
-
-        // let lde_cpu_1d = lde_cpu.clone().into_iter().flatten().collect::<Vec<_>>();
-        // assert_eq!(lde_cpu_1d.len(), gpu_lde_values_copied_to_cpu.len(), "LDE size mismatch");
-        // assert_eq!(lde_cpu_1d, gpu_lde_values_copied_to_cpu, "LDE values mismatch");
-
         // Step 2: Transpose on GPU using Zeknox
         let gpu_transposed = timed!(timing, "GPU transpose", {
-            let total_alloce_size = num_polys * lde_size;
-            // let total_alloce_size = num_polys.next_power_of_two() * lde_size;
-
-            let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloce_size)
+            let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloc_size)
                 .expect("Failed to allocate GPU memory for transpose");
 
             let log_n = log2_strict(lde_size);
@@ -387,54 +327,24 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
                 transpose_config,
             );
 
-            // gpu_lde_values will be automatically freed when it goes out of scope
-
             gpu_output
         });
-        print!("Completed GPU transpose for {} polynomials", num_polys);
 
         // Step 3: Copy back to CPU
-        let leaves = timed!(timing, "GPU to CPU transfer", {
-            let total_elements = num_polys * lde_size;
-            let mut cpu_data = vec![F::ZERO; total_elements];
+        let leaves_1d = timed!(timing, "GPU to CPU", {
+            let mut cpu_data = vec![F::ZERO; total_alloc_size];
 
             gpu_transposed
-                .copy_to_host(&mut cpu_data, total_elements)
+                .copy_to_host(&mut cpu_data, total_alloc_size)
                 .expect("Failed to copy data from GPU");
 
-            // // Reshape into leaves: Vec<Vec<F>> where each inner vec has num_polys elements
-            // cpu_data
-            //     .chunks(num_polys)
-            //     .map(|chunk| chunk.to_vec())
-            //     .collect::<Vec<_>>()
-
             cpu_data
         });
 
-        // let mut leaves_cpu = timed!(timing, "transpose LDEs", transpose(&lde_cpu));
-        // println!("tatal leaves: {}", leaves.len());
-        // println!("leaves[0]:     {:?}", leaves[0]);
-        // println!("leaves_cpu[0]: {:?}", leaves_cpu[0]);
-
-        // reverse_index_bits_in_place(&mut leaves_cpu);
-        // for i in 0..leaves.len() {
-        //     if leaves[i] != leaves_cpu[i] {
-        //         println!("Mismatch at leaf {}: \n{:?}\n{:?}\n", i, leaves[i], leaves_cpu[i]);
-        //     }
-        // }
-
-        // assert!(leaves == leaves_cpu, "Transposed LDE values mismatch");
-
-        // let merkle_tree = timed!(
-        //     timing,
-        //     "build Merkle tree",
-        //     MerkleTree::new_from_2d(leaves, cap_height)
-        // );
-
-       let merkle_tree = timed!(
+        let merkle_tree = timed!(
             timing,
             "build Merkle tree",
-            MerkleTree::new_from_1d(leaves,polynomials.len(), cap_height)
+            MerkleTree::new_from_1d(leaves_1d, polynomials.len(), cap_height)
         );
 
         Self {
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index b79391fc8..24229a25e 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -4,7 +4,6 @@
 use alloc::{format, vec, vec::Vec};
 use core::cmp::min;
 use core::mem::swap;
-use std::time::Instant;
 
 use anyhow::{ensure, Result};
 use hashbrown::HashMap;
@@ -276,6 +275,7 @@ where
             &gammas,
             &deltas,
             &alphas,
+            timing,
         )
     );
 
@@ -631,6 +631,7 @@ fn compute_quotient_polys<
     gammas: &[F],
     deltas: &[F],
     alphas: &[F],
+    timing: &mut TimingTree,
 ) -> Vec<PolynomialCoeffs<F>> {
     let num_challenges = common_data.config.num_challenges;
 
@@ -650,219 +651,198 @@ fn compute_quotient_polys<
     // steps away since we work on an LDE of degree `max_filtered_constraint_degree`.
     let next_step = 1 << quotient_degree_bits;
 
-    let timer = Instant::now();
-    let points = F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits);
-    println!("Time to compute LDE points: {:?}", timer.elapsed());
+    let points = timed!(
+        timing,
+        "set up subgroup generators",
+        F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits)
+    );
 
     let lde_size = points.len();
 
     let z_h_on_coset = ZeroPolyOnCoset::new(common_data.degree_bits(), quotient_degree_bits);
 
-    let timer = Instant::now();
     // Precompute the lookup table evals on the challenges in delta
     // These values are used to produce the final RE constraints for each lut,
     // and are the same each time in check_lookup_constraints_batched.
     // lut_poly_evals[i][j] gives the eval for the i'th challenge and the j'th lookup table
-    let lut_re_poly_evals: Vec<Vec<F>> = if has_lookup {
-        let num_lut_slots = LookupTableGate::num_slots(&common_data.config);
-        (0..num_challenges)
-            .map(move |i| {
-                let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)];
-                let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize];
-
-                (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors)
-                    .map(|r| {
-                        let lut_row_number = common_data.luts
-                            [r - LookupSelectors::StartEnd as usize]
-                            .len()
-                            .div_ceil(num_lut_slots);
-
-                        get_lut_poly(
-                            common_data,
-                            r - LookupSelectors::StartEnd as usize,
-                            cur_deltas,
-                            num_lut_slots * lut_row_number,
-                        )
-                        .eval(cur_challenge_delta)
-                    })
-                    .collect()
-            })
-            .collect()
-    } else {
-        vec![]
-    };
-    println!(
-        "Time to compute LUT RE polynomial evals: {:?}",
-        timer.elapsed()
+    let lut_re_poly_evals: Vec<Vec<F>> = timed!(
+        timing,
+        "compute LUT RE polynomial evals",
+        if has_lookup {
+            let num_lut_slots = LookupTableGate::num_slots(&common_data.config);
+            (0..num_challenges)
+                .map(move |i| {
+                    let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)];
+                    let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize];
+
+                    (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors)
+                        .map(|r| {
+                            let lut_row_number = common_data.luts
+                                [r - LookupSelectors::StartEnd as usize]
+                                .len()
+                                .div_ceil(num_lut_slots);
+
+                            get_lut_poly(
+                                common_data,
+                                r - LookupSelectors::StartEnd as usize,
+                                cur_deltas,
+                                num_lut_slots * lut_row_number,
+                            )
+                            .eval(cur_challenge_delta)
+                        })
+                        .collect()
+                })
+                .collect()
+        } else {
+            vec![]
+        }
     );
 
-    let timer = Instant::now();
     let lut_re_poly_evals_refs: Vec<&[F]> =
         lut_re_poly_evals.iter().map(|v| v.as_slice()).collect();
 
     let num_batches = points.len().div_ceil(BATCH_SIZE);
 
-    let quotient_values: Vec<Vec<F>> = points
-        .par_chunks(BATCH_SIZE)
-        .enumerate()
-        .flat_map(|(batch_i, xs_batch)| {
-            // Each batch must be the same size, except the last one, which may be smaller.
-            debug_assert!(
-                xs_batch.len() == BATCH_SIZE
-                    || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE)
-            );
-
-            let batch_size = xs_batch.len();
-            let batch_start = BATCH_SIZE * batch_i;
+    let quotient_values: Vec<Vec<F>> = timed!(timing, "compute quotient value", {
+        points
+            .par_chunks(BATCH_SIZE)
+            .enumerate()
+            .flat_map(|(batch_i, xs_batch)| {
+                // Each batch must be the same size, except the last one, which may be smaller.
+                debug_assert!(
+                    xs_batch.len() == BATCH_SIZE
+                        || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE)
+                );
 
-            let mut shifted_xs_batch = Vec::with_capacity(batch_size);
-            let mut local_zs_batch = Vec::with_capacity(batch_size);
-            let mut next_zs_batch = Vec::with_capacity(batch_size);
+                let batch_size = xs_batch.len();
+                let batch_start = BATCH_SIZE * batch_i;
 
-            let mut local_lookup_batch = Vec::with_capacity(batch_size);
-            let mut next_lookup_batch = Vec::with_capacity(batch_size);
+                let mut shifted_xs_batch = Vec::with_capacity(batch_size);
+                let mut local_zs_batch = Vec::with_capacity(batch_size);
+                let mut next_zs_batch = Vec::with_capacity(batch_size);
 
-            let mut partial_products_batch = Vec::with_capacity(batch_size);
-            let mut s_sigmas_batch = Vec::with_capacity(batch_size);
+                let mut local_lookup_batch = Vec::with_capacity(batch_size);
+                let mut next_lookup_batch = Vec::with_capacity(batch_size);
 
-            let mut local_constants_batch_refs = Vec::with_capacity(batch_size);
-            let mut local_wires_batch_refs = Vec::with_capacity(batch_size);
+                let mut partial_products_batch = Vec::with_capacity(batch_size);
+                let mut s_sigmas_batch = Vec::with_capacity(batch_size);
 
-            // let timer1 = Instant::now();
+                let mut local_constants_batch_refs = Vec::with_capacity(batch_size);
+                let mut local_wires_batch_refs = Vec::with_capacity(batch_size);
 
-            for (j, &x) in xs_batch.iter().enumerate() {
-                let i = batch_start + j;
-                let shifted_x = F::coset_shift() * x;
-                let i_next = (i + next_step) % lde_size;
-                let local_constants_sigmas = prover_data
-                    .constants_sigmas_commitment
-                    .get_lde_values(i, step);
-                let local_constants = &local_constants_sigmas[common_data.constants_range()];
-                let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()];
-                let local_wires = wires_commitment.get_lde_values(i, step);
-                let local_zs_partial_and_lookup =
-                    zs_partial_products_and_lookup_commitment.get_lde_values(i, step);
-                let next_zs_partial_and_lookup =
-                    zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step);
+                for (j, &x) in xs_batch.iter().enumerate() {
+                    let i = batch_start + j;
+                    let shifted_x = F::coset_shift() * x;
+                    let i_next = (i + next_step) % lde_size;
+                    let local_constants_sigmas = prover_data
+                        .constants_sigmas_commitment
+                        .get_lde_values(i, step);
+                    let local_constants = &local_constants_sigmas[common_data.constants_range()];
+                    let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()];
+                    let local_wires = wires_commitment.get_lde_values(i, step);
+                    let local_zs_partial_and_lookup =
+                        zs_partial_products_and_lookup_commitment.get_lde_values(i, step);
+                    let next_zs_partial_and_lookup =
+                        zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step);
 
-                let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()];
+                    let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()];
 
-                let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()];
+                    let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()];
 
-                let partial_products =
-                    &local_zs_partial_and_lookup[common_data.partial_products_range()];
+                    let partial_products =
+                        &local_zs_partial_and_lookup[common_data.partial_products_range()];
 
-                if has_lookup {
-                    let local_lookup_zs = &local_zs_partial_and_lookup[common_data.lookup_range()];
+                    if has_lookup {
+                        let local_lookup_zs =
+                            &local_zs_partial_and_lookup[common_data.lookup_range()];
 
-                    let next_lookup_zs = &next_zs_partial_and_lookup[common_data.lookup_range()];
-                    debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys());
+                        let next_lookup_zs =
+                            &next_zs_partial_and_lookup[common_data.lookup_range()];
+                        debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys());
 
-                    local_lookup_batch.push(local_lookup_zs);
-                    next_lookup_batch.push(next_lookup_zs);
-                }
+                        local_lookup_batch.push(local_lookup_zs);
+                        next_lookup_batch.push(next_lookup_zs);
+                    }
 
-                debug_assert_eq!(local_wires.len(), common_data.config.num_wires);
-                debug_assert_eq!(local_zs.len(), num_challenges);
+                    debug_assert_eq!(local_wires.len(), common_data.config.num_wires);
+                    debug_assert_eq!(local_zs.len(), num_challenges);
 
-                local_constants_batch_refs.push(local_constants);
-                local_wires_batch_refs.push(local_wires);
+                    local_constants_batch_refs.push(local_constants);
+                    local_wires_batch_refs.push(local_wires);
 
-                shifted_xs_batch.push(shifted_x);
-                local_zs_batch.push(local_zs);
-                next_zs_batch.push(next_zs);
-                partial_products_batch.push(partial_products);
-                s_sigmas_batch.push(s_sigmas);
-            }
+                    shifted_xs_batch.push(shifted_x);
+                    local_zs_batch.push(local_zs);
+                    next_zs_batch.push(next_zs);
+                    partial_products_batch.push(partial_products);
+                    s_sigmas_batch.push(s_sigmas);
+                }
 
-            // println!(
-            //     "Time to gather LDE values for batch {}: {:?}",
-            //     batch_i,
-            //     timer1.elapsed()
-            // );
-
-            // Optimized transposition with better cache locality
-            let n_constants = local_constants_batch_refs[0].len();
-            let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants];
-            for i in 0..n_constants {
-                let offset = i * xs_batch.len();
-                for (j, constants) in local_constants_batch_refs.iter().enumerate() {
-                    local_constants_batch[offset + j] = constants[i];
+                // Optimized transposition with better cache locality
+                let n_constants = local_constants_batch_refs[0].len();
+                let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants];
+                for i in 0..n_constants {
+                    let offset = i * xs_batch.len();
+                    for (j, constants) in local_constants_batch_refs.iter().enumerate() {
+                        local_constants_batch[offset + j] = constants[i];
+                    }
                 }
-            }
 
-            let n_wires = local_wires_batch_refs[0].len();
-            let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires];
-            for i in 0..n_wires {
-                let offset = i * xs_batch.len();
-                for (j, wires) in local_wires_batch_refs.iter().enumerate() {
-                    local_wires_batch[offset + j] = wires[i];
+                let n_wires = local_wires_batch_refs[0].len();
+                let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires];
+                for i in 0..n_wires {
+                    let offset = i * xs_batch.len();
+                    for (j, wires) in local_wires_batch_refs.iter().enumerate() {
+                        local_wires_batch[offset + j] = wires[i];
+                    }
                 }
-            }
 
-            let vars_batch = EvaluationVarsBaseBatch::new(
-                xs_batch.len(),
-                &local_constants_batch,
-                &local_wires_batch,
-                public_inputs_hash,
-            );
+                let vars_batch = EvaluationVarsBaseBatch::new(
+                    xs_batch.len(),
+                    &local_constants_batch,
+                    &local_wires_batch,
+                    public_inputs_hash,
+                );
 
-            // let timer1 = Instant::now();
-            let indices_batch: Vec<usize> = (batch_start..batch_start + batch_size).collect();
-            let mut quotient_values_batch = eval_vanishing_poly_base_batch::<F, D>(
-                common_data,
-                &indices_batch,
-                &shifted_xs_batch,
-                vars_batch,
-                &local_zs_batch,
-                &next_zs_batch,
-                &local_lookup_batch,
-                &next_lookup_batch,
-                &partial_products_batch,
-                &s_sigmas_batch,
-                betas,
-                gammas,
-                deltas,
-                alphas,
-                &z_h_on_coset,
-                &lut_re_poly_evals_refs,
-            );
-            // println!(
-            //     "Time to eval vanishing poly for batch {}: {:?}",
-            //     batch_i,
-            //     timer1.elapsed()
-            // );
-
-            // let timer1 = Instant::now();
-            for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() {
-                let i = batch_start + j;
-                let denominator_inv = z_h_on_coset.eval_inverse(i);
-                quotient_values
-                    .iter_mut()
-                    .for_each(|v| *v *= denominator_inv);
-            }
-            // println!(
-            //     "Time to divide out Z_H for batch {}: {:?}",
-            //     batch_i,
-            //     timer1.elapsed()
-            // );
+                let indices_batch: Vec<usize> = (batch_start..batch_start + batch_size).collect();
+                let mut quotient_values_batch = eval_vanishing_poly_base_batch::<F, D>(
+                    common_data,
+                    &indices_batch,
+                    &shifted_xs_batch,
+                    vars_batch,
+                    &local_zs_batch,
+                    &next_zs_batch,
+                    &local_lookup_batch,
+                    &next_lookup_batch,
+                    &partial_products_batch,
+                    &s_sigmas_batch,
+                    betas,
+                    gammas,
+                    deltas,
+                    alphas,
+                    &z_h_on_coset,
+                    &lut_re_poly_evals_refs,
+                );
 
-            quotient_values_batch
-        })
-        .collect();
+                for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() {
+                    let i = batch_start + j;
+                    let denominator_inv = z_h_on_coset.eval_inverse(i);
+                    quotient_values
+                        .iter_mut()
+                        .for_each(|v| *v *= denominator_inv);
+                }
 
-    println!(
-        "Time to compute quotient polys: {:?} for {} points",
-        timer.elapsed(),
-        quotient_values.len()
-    );
+                quotient_values_batch
+            })
+            .collect()
+    });
 
-    let timer = Instant::now();
-    let res = transpose(&quotient_values)
-        .into_par_iter()
-        .map(PolynomialValues::new)
-        .map(|values| values.coset_ifft(F::coset_shift()))
-        .collect();
-    println!("Time to compute quotient polys IFFT: {:?}", timer.elapsed());
-    res
+    timed!(
+        timing,
+        "transpose and final ifft",
+        transpose(&quotient_values)
+            .into_par_iter()
+            .map(PolynomialValues::new)
+            .map(|values| values.coset_ifft(F::coset_shift()))
+            .collect()
+    )
 }
diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs
index 0f6177daa..6a5b8a266 100644
--- a/plonky2/src/plonk/vanishing_poly.rs
+++ b/plonky2/src/plonk/vanishing_poly.rs
@@ -679,7 +679,7 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
 
         // Check LDC row and col transitions. It's the same constraint, with a row transition happening for slot == 0.
         let unfiltered_ldc_transition =
-            lu_sum_prods.multiply_accumulate(lu_prod, (z_x_lookup_sldcs[poly] - prev));
+            lu_sum_prods.multiply_accumulate(lu_prod, z_x_lookup_sldcs[poly] - prev);
         constraints
             .push(lookup_selectors[LookupSelectors::TransLdc as usize] * unfiltered_ldc_transition);
     }
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index cb11f05e2..d0ec960c8 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -5,8 +5,6 @@ use alloc::vec::Vec;
 
 #[doc(inline)]
 pub use plonky2_util::*;
-#[cfg(feature = "cuda")]
-use zeknox::{device::memory::HostOrDeviceSlice, transpose_rev_batch, types::TransposeConfig};
 
 use crate::field::polynomial::PolynomialValues;
 use crate::field::types::Field;
@@ -23,81 +21,6 @@ pub(crate) fn transpose_poly_values<F: Field>(polys: Vec<PolynomialValues<F>>) -
     transpose(&poly_values)
 }
 
-#[cfg(feature = "cuda")]
-fn transpose_gpu<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
-    use std::time::Instant;
-
-    if matrix.is_empty() || matrix[0].is_empty() {
-        return vec![];
-    }
-
-    let num_rows = matrix.len();
-    let num_cols = matrix[0].len();
-    let total_elements = num_rows * num_cols;
-
-    // Flatten the 2D matrix into a 1D vector for GPU
-    let mut flat_input: Vec<T> = Vec::with_capacity(total_elements);
-    for row in matrix {
-        flat_input.extend_from_slice(row);
-    }
-
-    let gpu_id = 0;
-    let log_n = (num_cols as f64).log2().ceil() as usize;
-
-    // Allocate GPU memory for input and output
-    let mut gpu_input: HostOrDeviceSlice<'_, T> =
-        HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements).unwrap();
-    let mut gpu_output: HostOrDeviceSlice<'_, T> =
-        HostOrDeviceSlice::cuda_malloc(gpu_id, total_elements).unwrap();
-
-    // Copy input to GPU
-    gpu_input.copy_from_host(&flat_input).unwrap();
-
-    // Configure transpose
-    let mut cfg = TransposeConfig::default();
-    cfg.batches = num_rows as u32;
-    cfg.are_inputs_on_device = true;
-    cfg.are_outputs_on_device = true;
-
-    let timers = Instant::now();
-    // Perform GPU transpose
-    transpose_rev_batch(
-        gpu_id,
-        gpu_output.as_mut_ptr(),
-        gpu_input.as_mut_ptr(),
-        log_n,
-        cfg,
-    );
-    println!(
-        "CUDA transpose of {}x{} took {:?}",
-        num_rows,
-        num_cols,
-        timers.elapsed()
-    );
-
-    let timer = Instant::now();
-    // Copy result back to host
-    let mut flat_output = vec![unsafe { std::mem::zeroed() }; total_elements];
-    gpu_output
-        .copy_to_host(&mut flat_output, total_elements)
-        .unwrap();
-    println!(
-        "CUDA transpose copy back and reshape of {}x{} took {:?}",
-        num_rows,
-        num_cols,
-        timer.elapsed()
-    );
-
-    // Reshape back to 2D (transposed) using chunks_exact for better performance
-    // The GPU transpose outputs in column-major order, so we can just chunk by num_rows
-    let result: Vec<Vec<T>> = flat_output
-        .chunks_exact(num_rows)
-        .map(|chunk| chunk.to_vec())
-        .collect();
-
-    result
-}
-
 pub fn transpose<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
     if matrix.is_empty() {
         return vec![];
@@ -105,31 +28,6 @@ pub fn transpose<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
 
     let len = matrix[0].len();
 
-    // #[cfg(feature = "cuda")]
-    // {
-    //     // Use GPU for large matrices
-    //     // Threshold: use GPU if total elements >= 2^16 (65536) or if CUDA_TRANSPOSE_THRESHOLD is set
-    //     let num_rows = matrix.len();
-    //     let num_cols = len;
-    //     let total_elements = num_rows * num_cols;
-
-    //     let use_gpu = if let Ok(threshold_str) = std::env::var("CUDA_TRANSPOSE_THRESHOLD") {
-    //         if let Ok(threshold) = threshold_str.parse::<usize>() {
-    //             total_elements >= threshold
-    //         } else {
-    //             total_elements >= 65536
-    //         }
-    //     } else {
-    //         total_elements >= 65536
-    //     };
-
-    //     if use_gpu && num_cols.is_power_of_two() {
-    //         return transpose_gpu(matrix);
-    //     }
-    // }
-
-    // CPU fallback
-    // Use sequential iteration for deterministic results
     (0..len)
         .map(|i| matrix.iter().map(|row| row[i]).collect())
         .collect()

From 248682545ef63d6d3b75e38cd37e97b151135abc Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Mon, 15 Dec 2025 22:13:14 +0000
Subject: [PATCH 29/37] finished

---
 plonky2/examples/fibonacci.rs | 67 ++++++++++++-----------------------
 1 file changed, 23 insertions(+), 44 deletions(-)

diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index d2d30e2d7..1cca1b513 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -1,9 +1,13 @@
 use anyhow::{Ok, Result};
+use log::Level;
 use plonky2::field::types::Field;
 use plonky2::iop::witness::{PartialWitness, WitnessWrite};
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 use plonky2::plonk::circuit_data::CircuitConfig;
 use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
+use plonky2::util::timing::TimingTree;
+
+const LOOP: usize = 100_000;
 
 /// An example of using Plonky2 to prove a statement of the form
 /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b."
@@ -19,31 +23,34 @@ fn main() -> Result<()> {
     type F = <C as GenericConfig<D>>::F;
 
     let config = CircuitConfig::standard_recursion_config();
-    println!("Building circuit...");
     let mut builder = CircuitBuilder::<F, D>::new(config);
-    println!("Building arithmetic circuit...");
     // The arithmetic circuit.
     let initial_a = builder.add_virtual_target();
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..99 {
+    for _ in 0..LOOP {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
     }
-    println!("Circuit built.");
 
     #[cfg(feature = "cuda")]
     {
-        let size = 3;
+        use plonky2_util::log2_ceil;
+
+        let size = log2_ceil(builder.num_gates());
 
         zeknox::clear_cuda_errors_rs();
-        println!("Initializing CUDA twiddle factors...");
+        println!(
+            "Initializing CUDA twiddle factors for dimeinsions 2^{} and 2^{}",
+            size,
+            size + 3
+        );
 
         zeknox::init_twiddle_factors_rs(0, size);
         zeknox::init_twiddle_factors_rs(0, size + 3);
-        // Initialize coset on GPU
+
         // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
         let coset_gen_u64 = 7u64;
         zeknox::init_coset_rs(0, size + 3, coset_gen_u64);
@@ -53,52 +60,24 @@ fn main() -> Result<()> {
     builder.register_public_input(initial_a);
     builder.register_public_input(initial_b);
     builder.register_public_input(cur_target);
-    println!("Public inputs registered.");
+
     // Provide initial values.
     let mut pw = PartialWitness::new();
     pw.set_target(initial_a, F::ZERO)?;
     pw.set_target(initial_b, F::ONE)?;
-    println!("Initial values set in witness.");
     let data = builder.build::<C>();
-    println!("Circuit data built. Generating proof...");
-    #[cfg(feature = "timing")]
-    {
-        use log::Level;
-        use plonky2::util::timing::TimingTree;
-        let mut timing = TimingTree::new("prove", Level::Info);
-        println!("Starting proof generation...");
-        let proof =
-            plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?;
 
-        println!(
-            "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
-            proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
-        );
+    let mut timing = TimingTree::new("prove", Level::Info);
 
-        // Print first few elements of wires_cap for comparison
-        println!("First wires_cap hash: {:?}", proof.proof.wires_cap.0[0]);
-        println!(
-            "First plonk_zs hash: {:?}",
-            proof.proof.plonk_zs_partial_products_cap.0[0]
-        );
-        println!(
-            "First quotient hash: {:?}",
-            proof.proof.quotient_polys_cap.0[0]
-        );
+    let proof = plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?;
 
-        timing.print();
-        data.verify(proof)?;
-    }
+    println!(
+        "{}-th Fibonacci number mod |F| (starting with {}, {}) is: {}",
+        LOOP, proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
+    );
 
-    #[cfg(not(feature = "timing"))]
-    {
-        let proof = data.prove(pw)?;
-        println!(
-            "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
-            proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
-        );
-        data.verify(proof)?;
-    }
+    timing.print();
+    data.verify(proof)?;
 
     println!("finished");
     Ok(())

From e0c397c8d6fccda0286365167ca2f22a1056f7e0 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Mon, 15 Dec 2025 22:36:00 +0000
Subject: [PATCH 30/37] clean up

---
 plonky2/src/fri/oracle.rs       | 108 ++++++++------------------------
 plonky2/src/hash/merkle_tree.rs |   6 +-
 plonky2/src/plonk/prover.rs     |  13 +---
 plonky2/src/util/mod.rs         |   2 +
 4 files changed, 33 insertions(+), 96 deletions(-)

diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index 56bb34c53..f3daa648d 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -54,8 +54,6 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     PolynomialBatch<F, C, D>
 {
     /// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
-    /// This function is called by the builder during preprocessing the circuit.
-    /// This function always calls IFFT on CPU to avoid strange GPU issue.
     pub fn from_values(
         values: Vec<PolynomialValues<F>>,
         rate_bits: usize,
@@ -64,6 +62,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
+        // The first IFFT is always done on CPU to avoid strange GPU issue.
         let coeffs = timed!(
             timing,
             "CPU IFFT",
@@ -73,25 +72,14 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
                 .collect::<Vec<_>>()
         );
 
-        if cfg!(feature = "cuda") {
-            Self::from_coeffs_gpu(
-                coeffs,
-                rate_bits,
-                blinding,
-                cap_height,
-                timing,
-                fft_root_table,
-            )
-        } else {
-            Self::from_coeffs_cpu(
-                coeffs,
-                rate_bits,
-                blinding,
-                cap_height,
-                timing,
-                fft_root_table,
-            )
-        }
+        Self::from_coeffs(
+            coeffs,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            fft_root_table,
+        )
     }
 
     /// Creates a list polynomial commitment for the polynomials `polynomials`.
@@ -157,79 +145,40 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         }
     }
 
+    #[cfg(feature = "cuda")]
     fn from_coeffs_gpu(
         polynomials: Vec<PolynomialCoeffs<F>>,
         rate_bits: usize,
         blinding: bool,
         cap_height: usize,
         timing: &mut TimingTree,
-        fft_root_table: Option<&FftRootTable<F>>,
+        _fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
+        assert!(F::CUDA_SUPPORT, "CUDA is not support for this field");
+
         let degree = polynomials[0].len();
 
         // If blinding, salt with two random elements to each leaf vector.
         let salt_size = if blinding { SALT_SIZE } else { 0 };
-        println!(
-            "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}",
-            polynomials.len(),
-            degree,
-            blinding,
-            salt_size
-        );
 
-        if F::CUDA_SUPPORT {
-            return Self::from_coeffs_gpu_optimized(
-                polynomials,
-                rate_bits,
-                blinding,
-                cap_height,
-                timing,
-                fft_root_table,
-                degree,
-                salt_size,
-            );
-        }
-
-        // Fallback to CPU path
-        let lde_values = polynomials
-            .iter()
-            .map(|p| {
-                assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
-                p.lde(rate_bits)
-                    .coset_fft_with_options(F::coset_shift(), Some(rate_bits), fft_root_table)
-                    .values
-            })
-            .chain(
-                (0..salt_size)
-                    .into_iter()
-                    .map(|_| F::rand_vec(degree << rate_bits)),
-            )
-            .collect::<Vec<_>>();
-        let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values));
-        reverse_index_bits_in_place(&mut leaves);
-        let merkle_tree = timed!(
-            timing,
-            "build Merkle tree",
-            MerkleTree::new_from_2d(leaves, cap_height)
-        );
-
-        Self {
+        Self::from_coeffs_gpu_helper(
             polynomials,
-            merkle_tree,
-            degree_log: log2_strict(degree),
             rate_bits,
             blinding,
-        }
+            cap_height,
+            timing,
+            degree,
+            salt_size,
+        )
     }
 
     #[cfg(feature = "cuda")]
-    fn from_coeffs_gpu_optimized(
+    fn from_coeffs_gpu_helper(
         polynomials: Vec<PolynomialCoeffs<F>>,
         rate_bits: usize,
         blinding: bool,
         cap_height: usize,
         timing: &mut TimingTree,
-        _fft_root_table: Option<&FftRootTable<F>>,
         degree: usize,
         salt_size: usize,
     ) -> Self {
@@ -248,9 +197,11 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         // Step 1: Compute coset FFT on GPU, keeping data on GPU
         let gpu_lde_values = timed!(timing, "GPU coset FFT", {
             // Allocate GPU memory for all polynomials
-            println!(
+            log::debug!(
                 "Allocating GPU memory for {} polynomials of size {} (total {} elements)",
-                num_polys, lde_size, total_alloc_size
+                num_polys,
+                lde_size,
+                total_alloc_size
             );
 
             let mut gpu_buffer = timed!(
@@ -366,16 +317,9 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
 
         // If blinding, salt with two random elements to each leaf vector.
         let salt_size = if blinding { SALT_SIZE } else { 0 };
-        println!(
-            "lde_values: num_polys={}, degree={}, blinding={}, salt_size={}",
-            polynomials.len(),
-            degree,
-            blinding,
-            salt_size
-        );
 
         polynomials
-            .iter()
+            .par_iter()
             .map(|p| {
                 assert_eq!(p.len(), degree, "Polynomial degrees inconsistent");
                 p.lde(rate_bits)
@@ -384,7 +328,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
             })
             .chain(
                 (0..salt_size)
-                    .into_iter()
+                    .into_par_iter()
                     .map(|_| F::rand_vec(degree << rate_bits)),
             )
             .collect()
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index b2a57df52..12f44970b 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -276,7 +276,7 @@ fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
     let leaves_count = leaves.len() / leaf_size;
 
     let num_gpus: usize = std::env::var("NUM_OF_GPUS")
-        .expect("NUM_OF_GPUS should be set")
+        .unwrap_or("1".to_string())
         .parse()
         .unwrap();
 
@@ -286,7 +286,7 @@ fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
     if *gpu_id_lock >= num_gpus as u64 {
         *gpu_id_lock = 0;
     }
-    println!("Using GPU id {} leave length {}", gpu_id, leaves.len());
+    log::debug!("Using GPU id {} leave length {}", gpu_id, leaves.len());
 
     let now = Instant::now();
     let gpu_leaves_buf_result = HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len());
@@ -351,7 +351,7 @@ fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
 
     unsafe {
         let num_gpus: usize = std::env::var("NUM_OF_GPUS")
-            .expect("NUM_OF_GPUS should be set")
+            .unwrap_or("1".to_string())
             .parse()
             .unwrap();
         if !FORCE_SINGLE_GPU
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index 24229a25e..b4112cf8f 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -165,17 +165,10 @@ where
         // Use sequential iteration for deterministic results
         witness
             .wire_values
-            .iter()
+            .par_iter()
             .map(|column| PolynomialValues::new(column.clone()))
             .collect()
     );
-    // Debug: Print first few wire values to check determinism
-    if !wires_values.is_empty() && !wires_values[0].values.is_empty() {
-        println!(
-            "First wire poly first 5 values: {:?}",
-            &wires_values[0].values[..5.min(wires_values[0].values.len())]
-        );
-    }
     let wires_commitment = timed!(
         timing,
         "compute wires commitment",
@@ -320,7 +313,6 @@ where
         "Opening point is in the subgroup."
     );
 
-    println!("Constructing the opening set, including lookups.");
     let openings = timed!(
         timing,
         "construct the opening set, including lookups",
@@ -334,7 +326,6 @@ where
             common_data
         )
     );
-    println!("Computed openings.");
 
     challenger.observe_openings(&openings.to_fri_openings());
     let instance = common_data.get_fri_instance(zeta);
@@ -357,7 +348,7 @@ where
             timing,
         )
     );
-    println!("Computed opening proofs.");
+
     let proof = Proof::<F, C, D> {
         wires_cap: wires_commitment.merkle_tree.cap,
         plonk_zs_partial_products_cap: partial_products_zs_and_lookup_commitment.merkle_tree.cap,
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index d0ec960c8..73ec06f7a 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -3,6 +3,7 @@
 #[cfg(not(feature = "std"))]
 use alloc::vec::Vec;
 
+use plonky2_maybe_rayon::{MaybeIntoParIter, ParallelIterator};
 #[doc(inline)]
 pub use plonky2_util::*;
 
@@ -29,6 +30,7 @@ pub fn transpose<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
     let len = matrix[0].len();
 
     (0..len)
+        .into_par_iter()
         .map(|i| matrix.iter().map(|row| row[i]).collect())
         .collect()
 }

From 0606854a7e572cc9d94e46e4207afafe8de27bbb Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Tue, 16 Dec 2025 15:14:13 +0000
Subject: [PATCH 31/37] fix bug

---
 field/Cargo.toml          |  4 ++--
 field/src/fft.rs          |  8 ++------
 plonky2/Cargo.toml        |  4 ++--
 plonky2/src/fri/oracle.rs | 37 ++++++++++++++++++-------------------
 4 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/field/Cargo.toml b/field/Cargo.toml
index 39ee8ef07..ba7de5f04 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -35,8 +35,8 @@ workspace = true
 
 
 [features]
-# default = []
-default = [ "cuda" ]
+default = []
+# default = [ "cuda" ]
 # default = [ "cuda", "cuda_sanity_check" ]
 cuda = []
 # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
diff --git a/field/src/fft.rs b/field/src/fft.rs
index e08fd8021..6f1d5355d 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -34,12 +34,8 @@ pub fn batch_fft<F: Field>(input: &[PolynomialCoeffs<F>]) -> Vec<PolynomialValue
     {
         let mut res = Vec::with_capacity(input.len());
         for poly in input.iter() {
-            let mut batch_res = Vec::with_capacity(poly.len());
-            for p in poly {
-                let pv = fft_with_options(p.clone(), None, None);
-                batch_res.push(pv);
-            }
-            res.extend(batch_res);
+            let pv = fft_with_options(poly.clone(), None, None);
+            res.push(pv);
         }
         res
     }
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index df60129ac..82594fb67 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -14,9 +14,9 @@ categories.workspace = true
 
 [features]
 # default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"]
-# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ]
+default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ]
 
-default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ]
 # default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 # default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 gate_testing = []
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index f3daa648d..e301e6e8e 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -91,25 +91,24 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
-        if cfg!(feature = "cuda") {
-            Self::from_coeffs_gpu(
-                polynomials,
-                rate_bits,
-                blinding,
-                cap_height,
-                timing,
-                fft_root_table,
-            )
-        } else {
-            Self::from_coeffs_cpu(
-                polynomials,
-                rate_bits,
-                blinding,
-                cap_height,
-                timing,
-                fft_root_table,
-            )
-        }
+        #[cfg(feature = "cuda")]
+        return Self::from_coeffs_gpu(
+            polynomials,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            fft_root_table,
+        );
+        #[cfg(not(feature = "cuda"))]
+        Self::from_coeffs_cpu(
+            polynomials,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            fft_root_table,
+        )
     }
 
     /// Creates a list polynomial commitment for the polynomials `polynomials`.

From ca5a5bb0a3d6e573264e5524439bfc9e799ef921 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Tue, 16 Dec 2025 21:58:08 +0000
Subject: [PATCH 32/37] optimized cpu memory alloc

---
 field/Cargo.toml          |  4 ++--
 plonky2/Cargo.toml        |  6 +++---
 plonky2/src/fri/oracle.rs | 11 ++++++++---
 plonky2/src/util/mem.rs   | 17 +++++++++++++++++
 plonky2/src/util/mod.rs   |  1 +
 5 files changed, 31 insertions(+), 8 deletions(-)
 create mode 100644 plonky2/src/util/mem.rs

diff --git a/field/Cargo.toml b/field/Cargo.toml
index ba7de5f04..39ee8ef07 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -35,8 +35,8 @@ workspace = true
 
 
 [features]
-default = []
-# default = [ "cuda" ]
+# default = []
+default = [ "cuda" ]
 # default = [ "cuda", "cuda_sanity_check" ]
 cuda = []
 # sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 82594fb67..10deefedf 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -14,9 +14,9 @@ categories.workspace = true
 
 [features]
 # default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ]
-
-# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ]
+# 
+default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ]
 # default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 # default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 gate_testing = []
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index e301e6e8e..364ea8497 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -185,6 +185,8 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         use zeknox::types::{NTTConfig, TransposeConfig};
         use zeknox::{ntt_batch_ptr, transpose_rev_batch};
 
+        use crate::util::mem::vec_zeroed;
+
         let lde_size = degree << rate_bits;
         let num_polys = polynomials.len() + salt_size;
         let total_alloc_size = num_polys * lde_size;
@@ -211,10 +213,13 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
             );
 
             // Copy all data to GPU in one go
+            let mut flat_data = timed!(
+                timing,
+                "Prepare CPU memory",
+                unsafe { vec_zeroed::<F>(total_alloc_size) }
+            );
 
-            let mut flat_data = vec![F::ZERO; total_alloc_size];
-
-            timed!(timing, "Prepare CPU memory", {
+            timed!(timing, "Copy CPU memory", {
                 for i in 0..polynomials.len() {
                     flat_data[i * lde_size..i * lde_size + degree]
                         .copy_from_slice(polynomials[i].coeffs.as_ref())
diff --git a/plonky2/src/util/mem.rs b/plonky2/src/util/mem.rs
new file mode 100644
index 000000000..c1cc44ef9
--- /dev/null
+++ b/plonky2/src/util/mem.rs
@@ -0,0 +1,17 @@
+// alloc memory for Vec<F>, where every element is 0. (a lot) faster than vec![F::ZERO; len]
+pub unsafe fn vec_zeroed<F>(len: usize) -> Vec<F> {
+    let elem_size = std::mem::size_of::<F>();
+    debug_assert!(elem_size != 0, "ZST not supported by this helper");
+
+    // Layout for len elements
+    let layout = std::alloc::Layout::array::<F>(len).expect("layout overflow");
+
+    // Allocate zeroed memory
+    let ptr = std::alloc::alloc_zeroed(layout) as *mut F;
+    if ptr.is_null() {
+        std::alloc::handle_alloc_error(layout);
+    }
+
+    // Take ownership as a Vec<F>
+    Vec::from_raw_parts(ptr, len, len)
+}
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index 73ec06f7a..a0e81e06e 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -11,6 +11,7 @@ use crate::field::polynomial::PolynomialValues;
 use crate::field::types::Field;
 
 pub(crate) mod context_tree;
+pub(crate) mod mem;
 pub(crate) mod partial_products;
 pub mod reducing;
 pub mod serialization;

From 67853083bc860c3df368026abe3ce6beb5d94764 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Tue, 16 Dec 2025 22:32:06 +0000
Subject: [PATCH 33/37] further opts

---
 field/src/lib.rs                             |  1 +
 plonky2/src/util/mem.rs => field/src/util.rs | 11 +++++++----
 plonky2/src/fri/oracle.rs                    | 11 ++++-------
 plonky2/src/hash/merkle_tree.rs              |  4 +++-
 plonky2/src/plonk/prover.rs                  |  1 +
 plonky2/src/util/mod.rs                      |  1 -
 6 files changed, 16 insertions(+), 13 deletions(-)
 rename plonky2/src/util/mem.rs => field/src/util.rs (58%)

diff --git a/field/src/lib.rs b/field/src/lib.rs
index 9a2ea4f9c..be1c4b512 100644
--- a/field/src/lib.rs
+++ b/field/src/lib.rs
@@ -24,6 +24,7 @@ pub mod polynomial;
 pub mod secp256k1_base;
 pub mod secp256k1_scalar;
 pub mod types;
+pub mod util;
 pub mod zero_poly_coset;
 
 #[cfg(test)]
diff --git a/plonky2/src/util/mem.rs b/field/src/util.rs
similarity index 58%
rename from plonky2/src/util/mem.rs
rename to field/src/util.rs
index c1cc44ef9..374a38986 100644
--- a/plonky2/src/util/mem.rs
+++ b/field/src/util.rs
@@ -1,15 +1,18 @@
+use alloc::vec::Vec;
+use core::mem;
+
 // alloc memory for Vec<F>, where every element is 0. (a lot) faster than vec![F::ZERO; len]
 pub unsafe fn vec_zeroed<F>(len: usize) -> Vec<F> {
-    let elem_size = std::mem::size_of::<F>();
+    let elem_size = mem::size_of::<F>();
     debug_assert!(elem_size != 0, "ZST not supported by this helper");
 
     // Layout for len elements
-    let layout = std::alloc::Layout::array::<F>(len).expect("layout overflow");
+    let layout = alloc::alloc::Layout::array::<F>(len).expect("layout overflow");
 
     // Allocate zeroed memory
-    let ptr = std::alloc::alloc_zeroed(layout) as *mut F;
+    let ptr = alloc::alloc::alloc_zeroed(layout) as *mut F;
     if ptr.is_null() {
-        std::alloc::handle_alloc_error(layout);
+        alloc::alloc::handle_alloc_error(layout);
     }
 
     // Take ownership as a Vec<F>
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index 364ea8497..292f2a0cf 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -181,12 +181,11 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         degree: usize,
         salt_size: usize,
     ) -> Self {
+        use plonky2_field::util::vec_zeroed;
         use zeknox::device::memory::HostOrDeviceSlice;
         use zeknox::types::{NTTConfig, TransposeConfig};
         use zeknox::{ntt_batch_ptr, transpose_rev_batch};
 
-        use crate::util::mem::vec_zeroed;
-
         let lde_size = degree << rate_bits;
         let num_polys = polynomials.len() + salt_size;
         let total_alloc_size = num_polys * lde_size;
@@ -213,11 +212,9 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
             );
 
             // Copy all data to GPU in one go
-            let mut flat_data = timed!(
-                timing,
-                "Prepare CPU memory",
-                unsafe { vec_zeroed::<F>(total_alloc_size) }
-            );
+            let mut flat_data = timed!(timing, "Prepare CPU memory", unsafe {
+                vec_zeroed::<F>(total_alloc_size)
+            });
 
             timed!(timing, "Copy CPU memory", {
                 for i in 0..polynomials.len() {
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 12f44970b..4773d71ab 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -576,6 +576,8 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
         leaf_len: usize,
         cap_height: usize,
     ) -> Self {
+        use plonky2_field::util::vec_zeroed;
+
         let log2_leaves_len = log2_strict(leaves_len);
         assert!(
             cap_height <= log2_leaves_len,
@@ -585,7 +587,7 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
         );
 
         // copy data from GPU in async mode
-        let mut host_leaves: Vec<F> = vec![F::ZERO; leaves_len * leaf_len];
+        let mut host_leaves: Vec<F> = unsafe { vec_zeroed(leaves_len * leaf_len) };
         let stream_copy = CudaStream::create().unwrap();
 
         let start = std::time::Instant::now();
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index b4112cf8f..18ce4fc64 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -7,6 +7,7 @@ use core::mem::swap;
 
 use anyhow::{ensure, Result};
 use hashbrown::HashMap;
+use plonky2_field::util::vec_zeroed;
 use plonky2_maybe_rayon::*;
 
 use super::circuit_builder::{LookupChallenges, LookupWire};
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index a0e81e06e..73ec06f7a 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -11,7 +11,6 @@ use crate::field::polynomial::PolynomialValues;
 use crate::field::types::Field;
 
 pub(crate) mod context_tree;
-pub(crate) mod mem;
 pub(crate) mod partial_products;
 pub mod reducing;
 pub mod serialization;

From fa060c41dabbdf8bbc8c73609aa301fc0459829d Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Sat, 10 Jan 2026 17:41:18 +0000
Subject: [PATCH 34/37] initialize CUDA twiddle factors in circuit builder and tests

---
 plonky2/src/gadgets/arithmetic_extension.rs | 36 +++++++++++++++++++++
 plonky2/src/plonk/circuit_builder.rs        | 16 +++++++++
 2 files changed, 52 insertions(+)

diff --git a/plonky2/src/gadgets/arithmetic_extension.rs b/plonky2/src/gadgets/arithmetic_extension.rs
index 9d1088030..6c8a253cf 100644
--- a/plonky2/src/gadgets/arithmetic_extension.rs
+++ b/plonky2/src/gadgets/arithmetic_extension.rs
@@ -629,6 +629,18 @@ mod tests {
 
     #[test]
     fn test_mul_many() -> Result<()> {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for a range of sizes that might be used
+            for i in 0..=20 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+            // Initialize coset for Goldilocks field (coset generator = 7)
+            let coset_gen_u64 = 7u64;
+            zeknox::init_coset_rs(0, 20, coset_gen_u64);
+        }
+
         const D: usize = 2;
         type C = PoseidonGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
@@ -665,6 +677,18 @@ mod tests {
 
     #[test]
     fn test_div_extension() -> Result<()> {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for a range of sizes that might be used
+            for i in 0..=20 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+            // Initialize coset for Goldilocks field (coset generator = 7)
+            let coset_gen_u64 = 7u64;
+            zeknox::init_coset_rs(0, 20, coset_gen_u64);
+        }
+
         const D: usize = 2;
         type C = PoseidonGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
@@ -692,6 +716,18 @@ mod tests {
 
     #[test]
     fn test_mul_algebra() -> Result<()> {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for a range of sizes that might be used
+            for i in 0..=20 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+            // Initialize coset for Goldilocks field (coset generator = 7)
+            let coset_gen_u64 = 7u64;
+            zeknox::init_coset_rs(0, 20, coset_gen_u64);
+        }
+
         const D: usize = 2;
         type C = KeccakGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs
index e6a81f378..74fa4be62 100644
--- a/plonky2/src/plonk/circuit_builder.rs
+++ b/plonky2/src/plonk/circuit_builder.rs
@@ -1225,6 +1225,22 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor)));
         let fft_root_table = fft_root_table(max_fft_points);
 
+        // Initialize GPU twiddle factors for all sizes we'll use
+        #[cfg(feature = "cuda")]
+        {
+            if F::CUDA_SUPPORT {
+                zeknox::clear_cuda_errors_rs();
+                // Initialize twiddle factors for every log-size up to the largest FFT size:
+                // degree_bits: the base degree
+                // degree_bits + rate_bits: the LDE size used in from_coeffs
+                // The loop starts at 0 so that all smaller sizes are covered as well.
+                let max_log_size = degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor));
+                for i in 0..=max_log_size {
+                    zeknox::init_twiddle_factors_rs(0, i);
+                }
+            }
+        }
+
         // This part of the code on GPU is buggy. So we use CPU for computation.
         // It does not impact performance as this is only done once during setup.
         let constants_sigmas_commitment = if commit_to_sigma {

From 2055bbc704b1f05d60de13e1e2dc59ec1c2e828d Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Sat, 10 Jan 2026 18:03:23 +0000
Subject: [PATCH 35/37] fix example

---
 plonky2/examples/fibonacci.rs | 50 +++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index 63c94cc78..fdde49534 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -1,39 +1,44 @@
-use anyhow::{Ok, Result};
+use std::time::Instant;
+
+use anyhow::Result;
 use log::Level;
 use plonky2::field::types::Field;
 use plonky2::iop::witness::{PartialWitness, WitnessWrite};
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 use plonky2::plonk::circuit_data::CircuitConfig;
-use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
+use plonky2::plonk::config::{GenericConfig, Poseidon2GoldilocksConfig, PoseidonGoldilocksConfig};
+use plonky2::plonk::prover::prove;
 use plonky2::util::timing::TimingTree;
 
-const LOOP: usize = 100_000;
-
 /// An example of using Plonky2 to prove a statement of the form
 /// "I know the 100th element of the Fibonacci sequence, starting with constants a and b."
 /// When a == 0 and b == 1, this is proving knowledge of the 100th (standard) Fibonacci number.
 fn main() -> Result<()> {
-    // Initialize logger to see timing output
     env_logger::Builder::from_default_env()
-        .format_timestamp(None)
         .filter_level(log::LevelFilter::Debug)
         .init();
+    work::<PoseidonGoldilocksConfig>()?;
+    work::<Poseidon2GoldilocksConfig>()
+}
+
+fn work<C: GenericConfig<2>>() -> Result<()> {
     const D: usize = 2;
 
     let config = CircuitConfig::standard_recursion_config();
-    let mut builder = CircuitBuilder::<F, D>::new(config);
+    let mut builder = CircuitBuilder::<C::F, D>::new(config);
+
     // The arithmetic circuit.
     let initial_a = builder.add_virtual_target();
     let initial_b = builder.add_virtual_target();
     let mut prev_target = initial_a;
     let mut cur_target = initial_b;
-    for _ in 0..LOOP {
+    for _ in 0..999999 {
         let temp = builder.add(prev_target, cur_target);
         prev_target = cur_target;
         cur_target = temp;
     }
 
-    #[cfg(feature = "cuda")]
+     #[cfg(feature = "cuda")]
     {
         use plonky2_util::log2_ceil;
 
@@ -62,22 +67,27 @@ fn main() -> Result<()> {
     // Provide initial values.
     let timer1 = Instant::now();
     let mut pw = PartialWitness::new();
-    pw.set_target(initial_a, F::ZERO)?;
-    pw.set_target(initial_b, F::ONE)?;
+    pw.set_target(initial_a, C::F::ZERO)?;
+    pw.set_target(initial_b, C::F::ONE)?;
+
     let data = builder.build::<C>();
+    let timer2 = Instant::now();
 
-    let mut timing = TimingTree::new("prove", Level::Info);
+    // Create a TimingTree to track detailed timing information
+    let mut timing = TimingTree::new("prove", Level::Debug);
+    let proof = prove::<C::F, C, D>(&data.prover_only, &data.common, pw, &mut timing)?;
+    let timer3 = Instant::now();
 
-    let proof = plonky2::plonk::prover::prove(&data.prover_only, &data.common, pw, &mut timing)?;
+    // Print the timing tree
+    timing.print();
 
     println!(
-        "{}-th Fibonacci number mod |F| (starting with {}, {}) is: {}",
-        LOOP, proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
+        "100th Fibonacci number mod |F| (starting with {}, {}) is: {}",
+        proof.public_inputs[0], proof.public_inputs[1], proof.public_inputs[2]
     );
 
-    timing.print();
-    data.verify(proof)?;
+    println!("Build time: {:?}", timer2.duration_since(timer1));
+    println!("Prove time: {:?}", timer3.duration_since(timer2));
 
-    println!("finished");
-    Ok(())
-}
+    data.verify(proof)
+}
\ No newline at end of file

From bd17b4ddfb6db20fb579aabacc92e9a817562543 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Sat, 10 Jan 2026 18:36:24 +0000
Subject: [PATCH 36/37] update dependency

---
 Cargo.toml                         | 3 ++-
 plonky2/examples/fibonacci.rs      | 4 ++--
 plonky2/src/hash/poseidon2/hash.rs | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 3bb243ecf..336f94466 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,8 @@ rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
-zeknox = { path = "../zeknox/wrappers/rust" }
+# zeknox = { path = "../zeknox/wrappers/rust" }
+zeknox = { git = "https://github.com/elliottech/zeknox", branch = "zz/cuda-integration" }
 
 [profile.release]
 opt-level = 3
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index fdde49534..2f8bb84ae 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -38,7 +38,7 @@ fn work<C: GenericConfig<2>>() -> Result<()> {
         cur_target = temp;
     }
 
-     #[cfg(feature = "cuda")]
+    #[cfg(feature = "cuda")]
     {
         use plonky2_util::log2_ceil;
 
@@ -90,4 +90,4 @@ fn work<C: GenericConfig<2>>() -> Result<()> {
     println!("Prove time: {:?}", timer3.duration_since(timer2));
 
     data.verify(proof)
-}
\ No newline at end of file
+}
diff --git a/plonky2/src/hash/poseidon2/hash.rs b/plonky2/src/hash/poseidon2/hash.rs
index 9fc6b8e4f..e73db08bc 100644
--- a/plonky2/src/hash/poseidon2/hash.rs
+++ b/plonky2/src/hash/poseidon2/hash.rs
@@ -10,9 +10,9 @@ use crate::gates::poseidon2::Poseidon2Gate;
 use crate::hash::hash_types::{HashOut, RichField};
 use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation};
 use crate::iop::ext_target::ExtensionTarget;
-use crate::iop::target::{BoolTarget, Target};use crate::plonk::config::HasherType;
+use crate::iop::target::{BoolTarget, Target};
 use crate::plonk::circuit_builder::CircuitBuilder;
-use crate::plonk::config::{AlgebraicHasher, Hasher};
+use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType};
 
 pub trait Poseidon2: PrimeField64 {
     #[inline]

From 041e1b75d22aaca351ae9aa9c3ee3b01a4a3c311 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-31-221.ap-northeast-1.compute.internal>
Date: Sat, 10 Jan 2026 18:58:51 +0000
Subject: [PATCH 37/37] revert zeknox dependency to local path

---
 Cargo.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 336f94466..3bb243ecf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,8 +14,7 @@ rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
-# zeknox = { path = "../zeknox/wrappers/rust" }
-zeknox = { git = "https://github.com/elliottech/zeknox", branch = "zz/cuda-integration" }
+zeknox = { path = "../zeknox/wrappers/rust" }
 
 [profile.release]
 opt-level = 3