diff --git a/.gitignore b/.gitignore
index 293a17bb6..695fe9b7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ pgo-data.profdata
 
 # MacOS nuisances
 .DS_Store
+
+*.log
+
diff --git a/Cargo.toml b/Cargo.toml
index 81dbbde49..3bb243ecf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,10 +9,12 @@ hashbrown = { version = "0.14.3", default-features = false, features = ["ahash",
 itertools = { version = "0.11.0", default-features = false }
 log = { version = "0.4.14", default-features = false }
 num = { version = "0.4", default-features = false, features = ["rand"] }
+once_cell = { version = "1.18.0", default-features = false }
 rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
+zeknox = { path = "../zeknox/wrappers/rust" }
 
 [profile.release]
 opt-level = 3
diff --git a/field/Cargo.toml b/field/Cargo.toml
index e13f49efd..39ee8ef07 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] }
 static_assertions = { workspace = true }
 unroll = { workspace = true }
 
+# cuda accelerator wrapper
+zeknox = { workspace = true }
+
 # Local dependencies
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
@@ -29,3 +32,12 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"]
 
 [lints]
 workspace = true
+
+
+[features]
+# default = []
+default = [ "cuda" ]
+# default = [ "cuda", "cuda_sanity_check" ]
+cuda = []
+# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
+cuda_sanity_check = ["cuda"]
\ No newline at end of file
diff --git a/field/src/fft.rs b/field/src/fft.rs
index d078ca6c3..6f1d5355d 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -11,6 +11,36 @@ use crate::types::Field;
 
 pub type FftRootTable<F> = Vec<Vec<F>>;
 
+pub fn batch_fft<F: Field>(input: &[PolynomialCoeffs<F>]) -> Vec<PolynomialValues<F>> {
+    #[cfg(feature = "cuda")]
+    {
+        use zeknox::ntt_batch;
+        use zeknox::types::NTTConfig;
+
+        let mut data = input
+            .iter()
+            .flat_map(|poly| poly.coeffs.clone())
+            .collect::<Vec<F>>();
+        let mut cfg = NTTConfig::default();
+        cfg.batches = input.len() as u32;
+        let poly_len = input[0].len();
+        ntt_batch(0, &mut data, log2_strict(poly_len), cfg);
+
+        data.chunks(poly_len)
+            .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+            .collect()
+    }
+    #[cfg(not(feature = "cuda"))]
+    {
+        let mut res = Vec::with_capacity(input.len());
+        for poly in input.iter() {
+            let pv = fft_with_options(poly.clone(), None, None);
+            res.push(pv);
+        }
+        res
+    }
+}
+
 pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     let lg_n = log2_strict(n);
     // bases[i] = g^2^i, for i = 0, ..., lg_n - 1
@@ -32,16 +62,72 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
+#[cfg(feature = "cuda")]
+fn fft_dispatch_gpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if F::CUDA_SUPPORT {
+        use zeknox::ntt_batch;
+        use zeknox::types::NTTConfig;
+
+        #[cfg(feature = "cuda_sanity_check")]
+        let cpu_res = {
+            let mut input_clone = input.to_vec();
+            fft_dispatch_cpu(&mut input_clone, zero_factor, root_table);
+            input_clone
+        };
+
+        ntt_batch(
+            0,
+            input,
+            input.len().trailing_zeros() as usize,
+            NTTConfig::default(),
+        );
+
+        #[cfg(feature = "cuda_sanity_check")]
+        for (i, (a, b)) in input.iter().zip(cpu_res.iter()).enumerate() {
+            if a != b {
+                panic!(
+                    "Mismatch at index {}: gpu result = {}, cpu result = {}",
+                    i, a, b
+                );
+            }
+        }
+        return;
+    } else {
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
+}
+
+fn fft_dispatch_cpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if root_table.is_some() {
+        fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap())
+    } else {
+        let computed = fft_root_table::<F>(input.len());
+        fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref())
+    };
+}
+
 #[inline]
 fn fft_dispatch<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
-    let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();
-
-    fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
+    #[cfg(feature = "cuda")]
+    {
+        fft_dispatch_gpu(input, zero_factor, root_table)
+    }
+    #[cfg(not(feature = "cuda"))]
+    {
+        fft_dispatch_cpu(input, zero_factor, root_table)
+    }
 }
 
 #[inline]
@@ -50,6 +136,7 @@ pub fn fft<F: Field>(poly: PolynomialCoeffs<F>) -> PolynomialValues<F> {
 }
 
 #[inline]
+
 pub fn fft_with_options<F: Field>(
     poly: PolynomialCoeffs<F>,
     zero_factor: Option<usize>,
@@ -65,6 +152,28 @@ pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
     ifft_with_options(poly, None, None)
 }
 
+#[inline]
+pub fn ifft_cpu<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
+    let n = poly.len();
+    let lg_n = log2_strict(n);
+    let n_inv = F::inverse_2exp(lg_n);
+
+    let PolynomialValues { values: mut buffer } = poly;
+    fft_dispatch_cpu(&mut buffer, None, None);
+
+    // We reverse all values except the first, and divide each by n.
+    buffer[0] *= n_inv;
+    buffer[n / 2] *= n_inv;
+    for i in 1..(n / 2) {
+        let j = n - i;
+        let coeffs_i = buffer[j] * n_inv;
+        let coeffs_j = buffer[i] * n_inv;
+        buffer[i] = coeffs_i;
+        buffer[j] = coeffs_j;
+    }
+    PolynomialCoeffs { coeffs: buffer }
+}
+
 pub fn ifft_with_options<F: Field>(
     poly: PolynomialValues<F>,
     zero_factor: Option<usize>,
@@ -214,6 +323,17 @@ mod tests {
 
     #[test]
     fn fft_and_ifft() {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for sizes we'll use
+            // degree_padded is 256 = 2^8
+            // lde then add 4 more bits
+            for i in 8..=12 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         type F = GoldilocksField;
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
@@ -222,7 +342,7 @@ mod tests {
         // "random", the last degree_padded-degree of them are zero.
         let coeffs = (0..degree)
             .map(|i| F::from_canonical_usize(i * 1337 % 100))
-            .chain(core::iter::repeat_n(F::ZERO, degree_padded - degree))
+            .chain(core::iter::repeat(F::ZERO).take(degree_padded - degree))
             .collect::<Vec<_>>();
         assert_eq!(coeffs.len(), degree_padded);
         let coefficients = PolynomialCoeffs { coeffs };
diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
index 8f2d85253..6dd15ce0d 100644
--- a/field/src/goldilocks_extensions.rs
+++ b/field/src/goldilocks_extensions.rs
@@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/2)
     const DTH_ROOT: Self = Self(18446744069414584320);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
+        [Self(18081566051660590251), Self(16121475356294670766)];
 
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)];
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
 }
 
 impl Mul for QuadraticExtension<GoldilocksField> {
@@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/4)
     const DTH_ROOT: Self = Self(281474976710656);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] =
-        [Self(0), Self(8295451483910296135), Self(0), Self(0)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
+        Self(5024755240244648895),
+        Self(13227474371289740625),
+        Self(3912887029498544536),
+        Self(3900057112666848848),
+    ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
-        [Self(0), Self(0), Self(0), Self(17216955519093520442)];
+        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
 }
 
 impl Mul for QuarticExtension<GoldilocksField> {
@@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField {
     const DTH_ROOT: Self = Self(1041288259238279555);
 
     const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
-        Self(4624713872807171977),
-        Self(381988216716071028),
-        Self(14499722700050429911),
-        Self(4870631734967222356),
-        Self(4518902370426242880),
+        Self(2899034827742553394),
+        Self(13012057356839176729),
+        Self(14593811582388663055),
+        Self(7722900811313895436),
+        Self(4557222484695340057),
     ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index a4da1fee0..449b62abd 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -77,17 +77,19 @@ impl Field for GoldilocksField {
     const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY;
 
     // Sage: `g = GF(p).multiplicative_generator()`
-    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720);
+    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7);
 
     // Sage:
     // ```
     // g_2 = g^((p - 1) / 2^32)
     // g_2.multiplicative_order().factor()
     // ```
-    const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926);
+    const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772);
 
     const BITS: usize = 64;
 
+    const CUDA_SUPPORT: bool = true;
+
     fn order() -> BigUint {
         Self::ORDER.into()
     }
diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs
index df7084572..9772fff56 100644
--- a/field/src/interpolation.rs
+++ b/field/src/interpolation.rs
@@ -77,6 +77,9 @@ pub fn interpolate2<F: Field>(points: [(F, F); 2], x: F) -> F {
 
 #[cfg(test)]
 mod tests {
+    #[cfg(feature = "cuda")]
+    use zeknox::init_twiddle_factors_rs;
+
     use super::*;
     use crate::extension::quartic::QuarticExtension;
     use crate::goldilocks_field::GoldilocksField;
@@ -87,7 +90,12 @@ mod tests {
     fn interpolant_random() {
         type F = GoldilocksField;
 
-        for deg in 0..10 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg in 2..10 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(deg));
             let domain = F::rand_vec(deg);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -101,7 +109,13 @@ mod tests {
     fn interpolant_random_roots_of_unity() {
         type F = GoldilocksField;
 
-        for deg_log in 0..4 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg_log in 1..4 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, deg_log);
+
             let deg = 1 << deg_log;
             let domain = F::two_adic_subgroup(deg_log);
             let coeffs = F::rand_vec(deg);
@@ -116,8 +130,15 @@ mod tests {
     fn interpolant_random_overspecified() {
         type F = GoldilocksField;
 
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
         for deg in 0..10 {
             let points = deg + 5;
+
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(points));
+
             let domain = F::rand_vec(points);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -137,6 +158,8 @@ mod tests {
         let points = [(F::rand(), F::rand()), (F::rand(), F::rand())];
         let x = F::rand();
 
+        #[cfg(feature = "cuda")]
+        init_twiddle_factors_rs(0, 2);
         let ev0 = interpolant(&points).eval(x);
         let ev1 = interpolate(&points, x, &barycentric_weights(&points));
         let ev2 = interpolate2(points, x);
diff --git a/field/src/lib.rs b/field/src/lib.rs
index 9a2ea4f9c..be1c4b512 100644
--- a/field/src/lib.rs
+++ b/field/src/lib.rs
@@ -24,6 +24,7 @@ pub mod polynomial;
 pub mod secp256k1_base;
 pub mod secp256k1_scalar;
 pub mod types;
+pub mod util;
 pub mod zero_poly_coset;
 
 #[cfg(test)]
diff --git a/field/src/polynomial/division.rs b/field/src/polynomial/division.rs
index 7d85d5492..a34602de4 100644
--- a/field/src/polynomial/division.rs
+++ b/field/src/polynomial/division.rs
@@ -78,7 +78,7 @@ impl<F: Field> PolynomialCoeffs<F> {
             .iter()
             .rev()
             .scan(F::ZERO, |acc, &c| {
-                *acc = *acc * z + c;
+                *acc = c.multiply_accumulate(*acc, z);
                 Some(*acc)
             })
             .collect::<Vec<_>>();
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index c13bbca27..ffe5d18c6 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -12,7 +12,7 @@ use plonky2_util::log2_strict;
 use serde::{Deserialize, Serialize};
 
 use crate::extension::{Extendable, FieldExtension};
-use crate::fft::{fft, fft_with_options, ifft, FftRootTable};
+use crate::fft::{fft, fft_with_options, ifft, ifft_cpu, FftRootTable};
 use crate::types::Field;
 
 /// A polynomial in point-value form.
@@ -55,10 +55,17 @@ impl<F: Field> PolynomialValues<F> {
         self.values.len()
     }
 
+    /// Adaptive IFFT: uses GPU if available, otherwise CPU.
     pub fn ifft(self) -> PolynomialCoeffs<F> {
         ifft(self)
     }
 
+    /// Enfored to use CPU IFFT.
+    /// Used for bypass the GPU issue during setup phase.
+    pub fn ifft_cpu(self) -> PolynomialCoeffs<F> {
+        ifft_cpu(self)
+    }
+
     /// Returns the polynomial whose evaluation on the coset `shift*H` is `self`.
     pub fn coset_ifft(self, shift: F) -> PolynomialCoeffs<F> {
         let mut shifted_coeffs = self.ifft();
@@ -438,7 +445,6 @@ impl<F: Field> Mul for &PolynomialCoeffs<F> {
 
 #[cfg(test)]
 mod tests {
-    use std::time::Instant;
 
     use rand::rngs::OsRng;
     use rand::Rng;
@@ -447,6 +453,17 @@ mod tests {
     use crate::goldilocks_field::GoldilocksField;
     use crate::types::Sample;
 
+    #[cfg(feature = "cuda")]
+    fn init_gpu_for_tests() {
+        zeknox::clear_cuda_errors_rs();
+        // Initialize twiddle factors for various sizes
+        for i in 0..=20 {
+            zeknox::init_twiddle_factors_rs(0, i);
+        }
+        let coset_gen_u64 = 7u64;
+        zeknox::init_coset_rs(0, 20, coset_gen_u64);
+    }
+
     #[test]
     fn test_trimmed() {
         type F = GoldilocksField;
@@ -475,6 +492,9 @@ mod tests {
 
     #[test]
     fn test_coset_fft() {
+        #[cfg(feature = "cuda")]
+        init_gpu_for_tests();
+
         type F = GoldilocksField;
 
         let k = 8;
@@ -496,6 +516,9 @@ mod tests {
 
     #[test]
     fn test_coset_ifft() {
+        #[cfg(feature = "cuda")]
+        init_gpu_for_tests();
+
         type F = GoldilocksField;
 
         let k = 8;
@@ -601,7 +624,10 @@ mod tests {
     // Test to see which polynomial division method is faster for divisions of the type
     // `(X^n - 1)/(X - a)
     #[test]
+    #[cfg(not(feature = "cuda"))]
     fn test_division_linear() {
+        use std::time::Instant;
+
         type F = GoldilocksField;
         let mut rng = OsRng;
         let l = 14;
diff --git a/field/src/types.rs b/field/src/types.rs
index 153231c9a..0c6947907 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -91,6 +91,9 @@ pub trait Field:
     /// The bit length of the field order.
     const BITS: usize;
 
+    /// Whether this field is supported by cuda
+    const CUDA_SUPPORT: bool = false;
+
     fn order() -> BigUint;
     fn characteristic() -> BigUint;
 
diff --git a/field/src/util.rs b/field/src/util.rs
new file mode 100644
index 000000000..374a38986
--- /dev/null
+++ b/field/src/util.rs
@@ -0,0 +1,20 @@
+use alloc::vec::Vec;
+use core::mem;
+
+// alloc memory for Vec<F>, where every element is 0. (a lot) faster than vec![F::ZERO; len]
+pub unsafe fn vec_zeroed<F>(len: usize) -> Vec<F> {
+    let elem_size = mem::size_of::<F>();
+    debug_assert!(elem_size != 0, "ZST not supported by this helper");
+
+    // Layout for len elements
+    let layout = alloc::alloc::Layout::array::<F>(len).expect("layout overflow");
+
+    // Allocate zeroed memory
+    let ptr = alloc::alloc::alloc_zeroed(layout) as *mut F;
+    if ptr.is_null() {
+        alloc::alloc::handle_alloc_error(layout);
+    }
+
+    // Take ownership as a Vec<F>
+    Vec::from_raw_parts(ptr, len, len)
+}
diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml
index 36f26fb79..fafd57d68 100644
--- a/plonky2/Cargo.toml
+++ b/plonky2/Cargo.toml
@@ -11,12 +11,22 @@ repository.workspace = true
 keywords.workspace = true
 categories.workspace = true
 
+
 [features]
-default = ["gate_testing", "parallel", "rand_chacha", "std", "timing"]
+# default = ["gate_testing", "rand_chacha", "std", "timing", "cuda"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "timing", ]
+# 
+default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", ]
+# default = ["gate_testing", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
+# default = ["gate_testing", "parallel", "rand_chacha", "std", "cuda", "timing", "cuda_sanity_check"]
 gate_testing = []
 parallel = ["hashbrown/rayon", "plonky2_maybe_rayon/parallel"]
 std = ["anyhow/std", "rand/std", "itertools/use_std"]
 timing = ["std", "dep:web-time"]
+cuda = ["plonky2_field/cuda"]
+# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
+cuda_sanity_check = [ "cuda", "plonky2_field/cuda_sanity_check" ]
+
 
 [dependencies]
 ahash = { workspace = true }
@@ -26,6 +36,7 @@ itertools = { workspace = true }
 keccak-hash = { version = "0.8.0", default-features = false }
 log = { workspace = true }
 num = { workspace = true }
+once_cell = { workspace = true, features = ["std"] }
 rand = { workspace = true }
 rand_chacha = { version = "0.3.1", optional = true, default-features = false }
 serde = { workspace = true, features = ["rc"] }
@@ -38,6 +49,9 @@ plonky2_field = { version = "1.0.0", path = "../field", default-features = false
 plonky2_maybe_rayon = { version = "1.0.0", path = "../maybe_rayon", default-features = false }
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
+# cuda accelerator wrapper
+zeknox = { workspace = true }
+
 # Plonky3 dependencies for Poseidon2
 p3-poseidon2 = { git = "https://github.com/Plonky3/Plonky3.git", rev = "eeb4e37b20127c4daa871b2bad0df30a7c7380db" }
 p3-goldilocks = { git = "https://github.com/Plonky3/Plonky3.git", rev = "eeb4e37b20127c4daa871b2bad0df30a7c7380db" }
diff --git a/plonky2/benches/merkle.rs b/plonky2/benches/merkle.rs
index 6230c1343..e9995be1a 100644
--- a/plonky2/benches/merkle.rs
+++ b/plonky2/benches/merkle.rs
@@ -23,7 +23,7 @@ pub(crate) fn bench_merkle_tree<F: RichField, H: Hasher<F>>(c: &mut Criterion) {
         let size = 1 << size_log;
         group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, _| {
             let leaves = vec![F::rand_vec(ELEMS_PER_LEAF); size];
-            b.iter(|| MerkleTree::<F, H>::new(leaves.clone(), 0));
+            b.iter(|| MerkleTree::<F, H>::new_from_2d(leaves.clone(), 0));
         });
     }
 }
diff --git a/plonky2/examples/fibonacci.rs b/plonky2/examples/fibonacci.rs
index be491aa21..2f8bb84ae 100644
--- a/plonky2/examples/fibonacci.rs
+++ b/plonky2/examples/fibonacci.rs
@@ -38,6 +38,27 @@ fn work<C: GenericConfig<2>>() -> Result<()> {
         cur_target = temp;
     }
 
+    #[cfg(feature = "cuda")]
+    {
+        use plonky2_util::log2_ceil;
+
+        let size = log2_ceil(builder.num_gates());
+
+        zeknox::clear_cuda_errors_rs();
+        println!(
+            "Initializing CUDA twiddle factors for dimeinsions 2^{} and 2^{}",
+            size,
+            size + 3
+        );
+
+        zeknox::init_twiddle_factors_rs(0, size);
+        zeknox::init_twiddle_factors_rs(0, size + 3);
+
+        // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
+        let coset_gen_u64 = 7u64;
+        zeknox::init_coset_rs(0, size + 3, coset_gen_u64);
+    }
+
     // Public inputs are the two initial values (provided below) and the result (which is generated).
     builder.register_public_input(initial_a);
     builder.register_public_input(initial_b);
diff --git a/plonky2/src/batch_fri/oracle.rs b/plonky2/src/batch_fri/oracle.rs
index 1f34cfef3..76316d8f5 100644
--- a/plonky2/src/batch_fri/oracle.rs
+++ b/plonky2/src/batch_fri/oracle.rs
@@ -15,7 +15,7 @@ use crate::fri::oracle::PolynomialBatch;
 use crate::fri::proof::FriProof;
 use crate::fri::structure::{FriBatchInfo, FriInstanceInfo};
 use crate::fri::FriParams;
-use crate::hash::batch_merkle_tree::BatchMerkleTree;
+// use crate::hash::batch_merkle_tree::BatchMerkleTree;
 use crate::hash::hash_types::RichField;
 use crate::iop::challenger::Challenger;
 use crate::plonk::config::GenericConfig;
@@ -300,6 +300,18 @@ mod test {
             reduction_arity_bits,
         };
 
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for all dimensions that will be used
+            // This test involves multiple polynomials and recursive verification,
+            // so we initialize a wider range of dimensions to be safe
+            let current_log_size = k0 + fri_params.config.rate_bits;
+            for i in 0..=current_log_size + 5 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let n0 = 1 << k0;
         let n1 = 1 << k1;
         let n2 = 1 << k2;
diff --git a/plonky2/src/batch_fri/prover.rs b/plonky2/src/batch_fri/prover.rs
index e71fe25b4..a12f87667 100644
--- a/plonky2/src/batch_fri/prover.rs
+++ b/plonky2/src/batch_fri/prover.rs
@@ -104,7 +104,8 @@ pub(crate) fn batch_fri_committed_trees<
 
         reverse_index_bits_in_place(&mut final_values.values);
         let chunked_values = final_values.values.par_chunks(arity).map(flatten).collect();
-        let tree = MerkleTree::<F, C::Hasher>::new(chunked_values, fri_params.config.cap_height);
+        let tree =
+            MerkleTree::<F, C::Hasher>::new_from_2d(chunked_values, fri_params.config.cap_height);
 
         challenger.observe_cap(&tree.cap);
         trees.push(tree);
@@ -263,6 +264,17 @@ mod tests {
         };
 
         let n = 1 << k;
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for all dimensions that will be used
+            let current_log_size = k + fri_params.config.rate_bits;
+            for i in 0..=current_log_size {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let trace = PolynomialValues::new((1..n + 1).map(F::from_canonical_i64).collect_vec());
 
         let polynomial_batch: BatchFriOracle<GoldilocksField, C, D> = BatchFriOracle::from_values(
@@ -359,6 +371,16 @@ mod tests {
             reduction_arity_bits,
         };
 
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for all dimensions that will be used
+            let current_log_size = k0 + fri_params.config.rate_bits;
+            for i in 0..=current_log_size {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let n0 = 1 << k0;
         let n1 = 1 << k1;
         let n2 = 1 << k2;
diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs
index e413071a4..292f2a0cf 100644
--- a/plonky2/src/fri/oracle.rs
+++ b/plonky2/src/fri/oracle.rs
@@ -62,10 +62,14 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
+        // The first IFFT is always done on CPU to avoid strange GPU issue.
         let coeffs = timed!(
             timing,
-            "IFFT",
-            values.into_par_iter().map(|v| v.ifft()).collect::<Vec<_>>()
+            "CPU IFFT",
+            values
+                .into_par_iter()
+                .map(|v| v.ifft_cpu())
+                .collect::<Vec<_>>()
         );
 
         Self::from_coeffs(
@@ -86,6 +90,35 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         cap_height: usize,
         timing: &mut TimingTree,
         fft_root_table: Option<&FftRootTable<F>>,
+    ) -> Self {
+        #[cfg(feature = "cuda")]
+        return Self::from_coeffs_gpu(
+            polynomials,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            fft_root_table,
+        );
+        #[cfg(not(feature = "cuda"))]
+        Self::from_coeffs_cpu(
+            polynomials,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            fft_root_table,
+        )
+    }
+
+    /// Creates a list polynomial commitment for the polynomials `polynomials`.
+    fn from_coeffs_cpu(
+        polynomials: Vec<PolynomialCoeffs<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        fft_root_table: Option<&FftRootTable<F>>,
     ) -> Self {
         let degree = polynomials[0].len();
         let lde_values = timed!(
@@ -99,7 +132,171 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         let merkle_tree = timed!(
             timing,
             "build Merkle tree",
-            MerkleTree::new(leaves, cap_height)
+            MerkleTree::new_from_2d(leaves, cap_height)
+        );
+
+        Self {
+            polynomials,
+            merkle_tree,
+            degree_log: log2_strict(degree),
+            rate_bits,
+            blinding,
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    fn from_coeffs_gpu(
+        polynomials: Vec<PolynomialCoeffs<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        _fft_root_table: Option<&FftRootTable<F>>,
+    ) -> Self {
+        assert!(F::CUDA_SUPPORT, "CUDA is not support for this field");
+
+        let degree = polynomials[0].len();
+
+        // If blinding, salt with two random elements to each leaf vector.
+        let salt_size = if blinding { SALT_SIZE } else { 0 };
+
+        Self::from_coeffs_gpu_helper(
+            polynomials,
+            rate_bits,
+            blinding,
+            cap_height,
+            timing,
+            degree,
+            salt_size,
+        )
+    }
+
+    #[cfg(feature = "cuda")]
+    fn from_coeffs_gpu_helper(
+        polynomials: Vec<PolynomialCoeffs<F>>,
+        rate_bits: usize,
+        blinding: bool,
+        cap_height: usize,
+        timing: &mut TimingTree,
+        degree: usize,
+        salt_size: usize,
+    ) -> Self {
+        use plonky2_field::util::vec_zeroed;
+        use zeknox::device::memory::HostOrDeviceSlice;
+        use zeknox::types::{NTTConfig, TransposeConfig};
+        use zeknox::{ntt_batch_ptr, transpose_rev_batch};
+
+        let lde_size = degree << rate_bits;
+        let num_polys = polynomials.len() + salt_size;
+        let total_alloc_size = num_polys * lde_size;
+
+        let salt_polys = (0..salt_size)
+            .map(|_| F::rand_vec(lde_size))
+            .collect::<Vec<_>>();
+
+        // Step 1: Compute coset FFT on GPU, keeping data on GPU
+        let gpu_lde_values = timed!(timing, "GPU coset FFT", {
+            // Allocate GPU memory for all polynomials
+            log::debug!(
+                "Allocating GPU memory for {} polynomials of size {} (total {} elements)",
+                num_polys,
+                lde_size,
+                total_alloc_size
+            );
+
+            let mut gpu_buffer = timed!(
+                timing,
+                format!("cuda alloc memory for {} elements", total_alloc_size).as_ref(),
+                HostOrDeviceSlice::cuda_malloc(0, total_alloc_size)
+                    .expect("Failed to allocate GPU memory")
+            );
+
+            // Copy all data to GPU in one go
+            let mut flat_data = timed!(timing, "Prepare CPU memory", unsafe {
+                vec_zeroed::<F>(total_alloc_size)
+            });
+
+            timed!(timing, "Copy CPU memory", {
+                for i in 0..polynomials.len() {
+                    flat_data[i * lde_size..i * lde_size + degree]
+                        .copy_from_slice(polynomials[i].coeffs.as_ref())
+                }
+                for i in polynomials.len()..num_polys {
+                    flat_data[i * lde_size..(i + 1) * lde_size]
+                        .copy_from_slice(salt_polys[i - polynomials.len()].as_slice());
+                }
+            });
+
+            timed!(
+                timing,
+                "CPU to GPU",
+                gpu_buffer
+                    .copy_from_host(&flat_data)
+                    .expect("Failed to copy data to GPU")
+            );
+
+            // Perform batched NTT on GPU
+            // Technically we don't really need to do FFTs for the salt polynomial
+            // but then the cuda memory becomes extremely difficult to handle
+            // so we might as well do those FFTs.
+            let log_domain_size = log2_strict(lde_size);
+            let ntt_config = NTTConfig {
+                batches: num_polys as u32,
+                are_inputs_on_device: true,
+                are_outputs_on_device: true,
+                with_coset: true,
+                ..Default::default()
+            };
+            timed!(
+                timing,
+                format!(
+                    "GPU batch NTT for {} poly of degree {}",
+                    num_polys, lde_size
+                )
+                .as_ref(),
+                ntt_batch_ptr(0, gpu_buffer.as_mut_ptr(), log_domain_size, ntt_config)
+            );
+            gpu_buffer
+        });
+
+        // Step 2: Transpose on GPU using Zeknox
+        let gpu_transposed = timed!(timing, "GPU transpose", {
+            let mut gpu_output = HostOrDeviceSlice::cuda_malloc(0, total_alloc_size)
+                .expect("Failed to allocate GPU memory for transpose");
+
+            let log_n = log2_strict(lde_size);
+            let transpose_config = TransposeConfig {
+                batches: num_polys as u32,
+                are_inputs_on_device: true,
+                are_outputs_on_device: true,
+            };
+
+            transpose_rev_batch(
+                0,
+                gpu_output.as_mut_ptr(),
+                gpu_lde_values.as_ptr(),
+                log_n,
+                transpose_config,
+            );
+
+            gpu_output
+        });
+
+        // Step 3: Copy back to CPU
+        let leaves_1d = timed!(timing, "GPU to CPU", {
+            let mut cpu_data = vec![F::ZERO; total_alloc_size];
+
+            gpu_transposed
+                .copy_to_host(&mut cpu_data, total_alloc_size)
+                .expect("Failed to copy data from GPU");
+
+            cpu_data
+        });
+
+        let merkle_tree = timed!(
+            timing,
+            "build Merkle tree",
+            MerkleTree::new_from_1d(leaves_1d, polynomials.len(), cap_height)
         );
 
         Self {
@@ -142,7 +339,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     pub fn get_lde_values(&self, index: usize, step: usize) -> &[F] {
         let index = index * step;
         let index = reverse_bits(index, self.degree_log + self.rate_bits);
-        let slice = &self.merkle_tree.leaves[index];
+        let slice = &self.merkle_tree.get(index);
         &slice[..slice.len() - if self.blinding { SALT_SIZE } else { 0 }]
     }
 
diff --git a/plonky2/src/fri/prover.rs b/plonky2/src/fri/prover.rs
index 24c88ced7..e5792cb2c 100644
--- a/plonky2/src/fri/prover.rs
+++ b/plonky2/src/fri/prover.rs
@@ -101,7 +101,8 @@ fn fri_committed_trees<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>,
             .par_chunks(arity)
             .map(|chunk: &[F::Extension]| flatten(chunk))
             .collect();
-        let tree = MerkleTree::<F, C::Hasher>::new(chunked_values, fri_params.config.cap_height);
+        let tree =
+            MerkleTree::<F, C::Hasher>::new_from_2d(chunked_values, fri_params.config.cap_height);
 
         challenger.observe_cap(&tree.cap);
         trees.push(tree);
diff --git a/plonky2/src/gadgets/arithmetic_extension.rs b/plonky2/src/gadgets/arithmetic_extension.rs
index 9d1088030..6c8a253cf 100644
--- a/plonky2/src/gadgets/arithmetic_extension.rs
+++ b/plonky2/src/gadgets/arithmetic_extension.rs
@@ -629,6 +629,18 @@ mod tests {
 
     #[test]
     fn test_mul_many() -> Result<()> {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for a range of sizes that might be used
+            for i in 0..=20 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+            // Initialize coset for Goldilocks field (coset generator = 7)
+            let coset_gen_u64 = 7u64;
+            zeknox::init_coset_rs(0, 20, coset_gen_u64);
+        }
+
         const D: usize = 2;
         type C = PoseidonGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
@@ -665,6 +677,18 @@ mod tests {
 
     #[test]
     fn test_div_extension() -> Result<()> {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for a range of sizes that might be used
+            for i in 0..=20 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+            // Initialize coset for Goldilocks field (coset generator = 7)
+            let coset_gen_u64 = 7u64;
+            zeknox::init_coset_rs(0, 20, coset_gen_u64);
+        }
+
         const D: usize = 2;
         type C = PoseidonGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
@@ -692,6 +716,18 @@ mod tests {
 
     #[test]
     fn test_mul_algebra() -> Result<()> {
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            // Initialize twiddle factors for a range of sizes that might be used
+            for i in 0..=20 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+            // Initialize coset for Goldilocks field (coset generator = 7)
+            let coset_gen_u64 = 7u64;
+            zeknox::init_coset_rs(0, 20, coset_gen_u64);
+        }
+
         const D: usize = 2;
         type C = KeccakGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
diff --git a/plonky2/src/gates/equality_base.rs b/plonky2/src/gates/equality_base.rs
index 50a315e81..86c302e18 100644
--- a/plonky2/src/gates/equality_base.rs
+++ b/plonky2/src/gates/equality_base.rs
@@ -160,7 +160,6 @@ impl<F: RichField + Extendable<D>, const D: usize> Gate<F, D> for EqualityGate {
                 )
             })
             .collect();
-        //println!("generators {:?}", result.len());
         result
     }
 
diff --git a/plonky2/src/hash/keccak.rs b/plonky2/src/hash/keccak.rs
index d3fa8c4b2..61e7cb87e 100644
--- a/plonky2/src/hash/keccak.rs
+++ b/plonky2/src/hash/keccak.rs
@@ -7,7 +7,7 @@ use keccak_hash::keccak;
 
 use crate::hash::hash_types::{BytesHash, RichField};
 use crate::hash::hashing::PlonkyPermutation;
-use crate::plonk::config::Hasher;
+use crate::plonk::config::{Hasher, HasherType};
 use crate::util::serialization::Write;
 
 pub const SPONGE_RATE: usize = 8;
@@ -103,6 +103,7 @@ impl<F: RichField> PlonkyPermutation<F> for KeccakPermutation<F> {
 pub struct KeccakHash<const N: usize>;
 impl<F: RichField, const N: usize> Hasher<F> for KeccakHash<N> {
     const HASH_SIZE: usize = N;
+    const HASHER_TYPE: HasherType = HasherType::Keccak;
     type Hash = BytesHash<N>;
     type Permutation = KeccakPermutation<F>;
 
diff --git a/plonky2/src/hash/merkle_proofs.rs b/plonky2/src/hash/merkle_proofs.rs
index 74963a2a4..cbd20cea5 100644
--- a/plonky2/src/hash/merkle_proofs.rs
+++ b/plonky2/src/hash/merkle_proofs.rs
@@ -240,6 +240,7 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
 
     /// Same as `verify_batch_merkle_proof_to_cap`, except with the final "cap index" as separate parameter,
     /// rather than being contained in `leaf_index_bits`.
+    #[allow(dead_code)]
     pub(crate) fn verify_batch_merkle_proof_to_cap_with_cap_index<H: AlgebraicHasher<F>>(
         &mut self,
         leaf_data: &[Vec<Target>],
@@ -342,7 +343,8 @@ mod tests {
         let n = 1 << log_n;
         let cap_height = 1;
         let leaves = random_data::<F>(n, 7);
-        let tree = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new(leaves, cap_height);
+        let tree =
+            MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves, cap_height);
         let i: usize = OsRng.gen_range(0..n);
         let proof = tree.prove(i);
 
@@ -359,9 +361,10 @@ mod tests {
         let i_c = builder.constant(F::from_canonical_usize(i));
         let i_bits = builder.split_le(i_c, log_n);
 
-        let data = builder.add_virtual_targets(tree.leaves[i].len());
+        let data = builder.add_virtual_targets(tree.leaf_size);
+        let leaf = tree.get(i);
         for j in 0..data.len() {
-            pw.set_target(data[j], tree.leaves[i][j])?;
+            pw.set_target(data[j], leaf[j])?;
         }
 
         builder.verify_merkle_proof_to_cap::<<C as GenericConfig<D>>::InnerHasher>(
diff --git a/plonky2/src/hash/merkle_tree.rs b/plonky2/src/hash/merkle_tree.rs
index 31bcf5e37..4773d71ab 100644
--- a/plonky2/src/hash/merkle_tree.rs
+++ b/plonky2/src/hash/merkle_tree.rs
@@ -1,16 +1,52 @@
-#[cfg(not(feature = "std"))]
-use alloc::vec::Vec;
 use core::mem::MaybeUninit;
 use core::slice;
+use std::collections::HashSet;
+#[cfg(feature = "cuda")]
+use std::sync::Arc;
+#[cfg(feature = "cuda")]
+use std::sync::Mutex;
+use std::time::Instant;
+#[cfg(not(feature = "std"))]
+use std::vec::Vec;
 
+use num::range;
+#[cfg(feature = "cuda")]
+use once_cell::sync::Lazy;
 use plonky2_maybe_rayon::*;
 use serde::{Deserialize, Serialize};
+#[cfg(feature = "cuda")]
+use zeknox::device::memory::HostOrDeviceSlice;
+#[cfg(feature = "cuda")]
+use zeknox::device::stream::CudaStream;
+#[cfg(feature = "cuda")]
+use zeknox::{
+    fill_digests_buf_linear_gpu_with_gpu_ptr, fill_digests_buf_linear_multigpu_with_gpu_ptr,
+};
 
 use crate::hash::hash_types::RichField;
+#[cfg(feature = "cuda")]
+use crate::hash::hash_types::NUM_HASH_OUT_ELTS;
 use crate::hash::merkle_proofs::MerkleProof;
+#[cfg(feature = "cuda")]
+use crate::plonk::config::HasherType;
 use crate::plonk::config::{GenericHashOut, Hasher};
 use crate::util::log2_strict;
 
+#[cfg(feature = "cuda")]
+pub static GPU_ID: Lazy<Arc<Mutex<u64>>> = Lazy::new(|| Arc::new(Mutex::new(0)));
+
+#[cfg(feature = "cuda")]
+fn print_time(_now: Instant, _msg: &str) {
+    // println!("Time {} {} ms", _msg, _now.elapsed().as_millis());
+}
+
+#[cfg(not(feature = "cuda"))]
+#[allow(dead_code)]
+fn print_time(_now: Instant, _msg: &str) {}
+
+#[cfg(feature = "cuda")]
+const FORCE_SINGLE_GPU: bool = true;
+
 /// The Merkle cap of height `h` of a Merkle tree is the `h`-th layer (from the root) of the tree.
 /// It can be used in place of the root to verify Merkle paths, which are `h` elements shorter.
 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -45,7 +81,10 @@ impl<F: RichField, H: Hasher<F>> MerkleCap<F, H> {
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct MerkleTree<F: RichField, H: Hasher<F>> {
     /// The data in the leaves of the Merkle tree.
-    pub leaves: Vec<Vec<F>>,
+    // pub leaves: Vec<Vec<F>>,
+    pub leaves: Vec<F>,
+
+    pub leaf_size: usize,
 
     /// The digests in the tree. Consists of `cap.len()` sub-trees, each corresponding to one
     /// element in `cap`. Each subtree is contiguous and located at
@@ -64,6 +103,7 @@ pub struct MerkleTree<F: RichField, H: Hasher<F>> {
 impl<F: RichField, H: Hasher<F>> Default for MerkleTree<F, H> {
     fn default() -> Self {
         Self {
+            leaf_size: 0,
             leaves: Vec::new(),
             digests: Vec::new(),
             cap: MerkleCap::default(),
@@ -71,7 +111,7 @@ impl<F: RichField, H: Hasher<F>> Default for MerkleTree<F, H> {
     }
 }
 
-pub(crate) fn capacity_up_to_mut<T>(v: &mut Vec<T>, len: usize) -> &mut [MaybeUninit<T>] {
+fn capacity_up_to_mut<T>(v: &mut Vec<T>, len: usize) -> &mut [MaybeUninit<T>] {
     assert!(v.capacity() >= len);
     let v_ptr = v.as_mut_ptr().cast::<MaybeUninit<T>>();
     unsafe {
@@ -83,59 +123,105 @@ pub(crate) fn capacity_up_to_mut<T>(v: &mut Vec<T>, len: usize) -> &mut [MaybeUn
     }
 }
 
-pub(crate) fn fill_subtree<F: RichField, H: Hasher<F>>(
+fn fill_subtree<F: RichField, H: Hasher<F>>(
     digests_buf: &mut [MaybeUninit<H::Hash>],
-    leaves: &[Vec<F>],
+    leaves: &[F],
+    leaf_size: usize,
 ) -> H::Hash {
-    assert_eq!(leaves.len(), digests_buf.len() / 2 + 1);
-    if digests_buf.is_empty() {
-        H::hash_or_noop(&leaves[0])
-    } else {
-        // Layout is: left recursive output || left child digest
-        //             || right child digest || right recursive output.
-        // Split `digests_buf` into the two recursive outputs (slices) and two child digests
-        // (references).
-        let (left_digests_buf, right_digests_buf) = digests_buf.split_at_mut(digests_buf.len() / 2);
-        let (left_digest_mem, left_digests_buf) = left_digests_buf.split_last_mut().unwrap();
-        let (right_digest_mem, right_digests_buf) = right_digests_buf.split_first_mut().unwrap();
-        // Split `leaves` between both children.
-        let (left_leaves, right_leaves) = leaves.split_at(leaves.len() / 2);
-
-        let (left_digest, right_digest) = plonky2_maybe_rayon::join(
-            || fill_subtree::<F, H>(left_digests_buf, left_leaves),
-            || fill_subtree::<F, H>(right_digests_buf, right_leaves),
-        );
+    let leaves_count = leaves.len() / leaf_size;
+
+    // if one leaf => return it hash
+    if leaves_count == 1 {
+        let hash = H::hash_or_noop(leaves);
+        digests_buf[0].write(hash);
+        return hash;
+    }
+    // if two leaves => return their concat hash
+    if leaves_count == 2 {
+        let (leaf1, leaf2) = leaves.split_at(leaf_size);
+        let hash_left = H::hash_or_noop(leaf1);
+        let hash_right = H::hash_or_noop(leaf2);
+        digests_buf[0].write(hash_left);
+        digests_buf[1].write(hash_right);
+        return H::two_to_one(hash_left, hash_right);
+    }
+
+    assert_eq!(leaves_count, digests_buf.len() / 2 + 1);
+
+    // leaves first - we can do all in parallel
+    let (_, digests_leaves) = digests_buf.split_at_mut(digests_buf.len() - leaves_count);
+    digests_leaves
+        .into_par_iter()
+        .enumerate()
+        .for_each(|(leaf_idx, digest)| {
+            let (_, r) = leaves.split_at(leaf_idx * leaf_size);
+            let (leaf, _) = r.split_at(leaf_size);
+            digest.write(H::hash_or_noop(leaf));
+        });
 
-        left_digest_mem.write(left_digest);
-        right_digest_mem.write(right_digest);
-        H::two_to_one(left_digest, right_digest)
+    // internal nodes - we can do in parallel per level
+    let mut last_index = digests_buf.len() - leaves_count;
+
+    for level_log in range(1, log2_strict(leaves_count)).rev() {
+        let level_size = 1 << level_log;
+        let (_, digests_slice) = digests_buf.split_at_mut(last_index - level_size);
+        let (digests_slice, next_digests) = digests_slice.split_at_mut(level_size);
+
+        digests_slice
+            .into_par_iter()
+            .zip(last_index - level_size..last_index)
+            .for_each(|(digest, idx)| {
+                let left_idx = 2 * (idx + 1) - last_index;
+                let right_idx = left_idx + 1;
+
+                unsafe {
+                    let left_digest = next_digests[left_idx].assume_init();
+                    let right_digest = next_digests[right_idx].assume_init();
+                    digest.write(H::two_to_one(left_digest, right_digest));
+                }
+            });
+        last_index -= level_size;
     }
+
+    // return cap hash
+    let hash: <H as Hasher<F>>::Hash;
+    unsafe {
+        let left_digest = digests_buf[0].assume_init();
+        let right_digest = digests_buf[1].assume_init();
+        hash = H::two_to_one(left_digest, right_digest);
+    }
+    hash
 }
 
-pub(crate) fn fill_digests_buf<F: RichField, H: Hasher<F>>(
+fn fill_digests_buf<F: RichField, H: Hasher<F>>(
     digests_buf: &mut [MaybeUninit<H::Hash>],
     cap_buf: &mut [MaybeUninit<H::Hash>],
-    leaves: &[Vec<F>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
     cap_height: usize,
 ) {
     // Special case of a tree that's all cap. The usual case will panic because we'll try to split
     // an empty slice into chunks of `0`. (We would not need this if there was a way to split into
     // `blah` chunks as opposed to chunks _of_ `blah`.)
+    let leaves_count = leaves.len() / leaf_size;
+
     if digests_buf.is_empty() {
-        debug_assert_eq!(cap_buf.len(), leaves.len());
+        debug_assert_eq!(cap_buf.len(), leaves_count);
         cap_buf
             .par_iter_mut()
-            .zip(leaves)
-            .for_each(|(cap_buf, leaf)| {
+            .enumerate()
+            .for_each(|(leaf_idx, cap_buf)| {
+                let (_, r) = leaves.split_at(leaf_idx * leaf_size);
+                let (leaf, _) = r.split_at(leaf_size);
                 cap_buf.write(H::hash_or_noop(leaf));
             });
         return;
     }
 
     let subtree_digests_len = digests_buf.len() >> cap_height;
-    let subtree_leaves_len = leaves.len() >> cap_height;
+    let subtree_leaves_len = leaves_count >> cap_height;
     let digests_chunks = digests_buf.par_chunks_exact_mut(subtree_digests_len);
-    let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len);
+    let leaves_chunks = leaves.par_chunks_exact(subtree_leaves_len * leaf_size);
     assert_eq!(digests_chunks.len(), cap_buf.len());
     assert_eq!(digests_chunks.len(), leaves_chunks.len());
     digests_chunks.zip(cap_buf).zip(leaves_chunks).for_each(
@@ -143,55 +229,251 @@ pub(crate) fn fill_digests_buf<F: RichField, H: Hasher<F>>(
             // We have `1 << cap_height` sub-trees, one for each entry in `cap`. They are totally
             // independent, so we schedule one task for each. `digests_buf` and `leaves` are split
             // into `1 << cap_height` slices, one for each sub-tree.
-            subtree_cap.write(fill_subtree::<F, H>(subtree_digests, subtree_leaves));
+            subtree_cap.write(fill_subtree::<F, H>(
+                subtree_digests,
+                subtree_leaves,
+                leaf_size,
+            ));
         },
     );
+
+    // TODO - debug code - to remove in future
+    /*
+    let digests_count: u64 = digests_buf.len().try_into().unwrap();
+    let leaves_count: u64 = leaves.len().try_into().unwrap();
+    let cap_height: u64  = cap_height.try_into().unwrap();
+    let leaf_size: u64 = leaves[0].len().try_into().unwrap();
+    let fname = format!("cpu-{}-{}-{}-{}.txt", digests_count, leaves_count, leaf_size, cap_height);
+    let mut file = File::create(fname).unwrap();
+    for digest in digests_buf {
+        unsafe {
+            let hash = digest.assume_init().to_vec();
+            for x in hash {
+                let str = format!("{} ", x.to_canonical_u64());
+                file.write_all(str.as_bytes());
+            }
+        }
+        file.write_all(b"\n");
+    }
+    */
+}
+
+#[cfg(feature = "cuda")]
+#[repr(C)]
+union U8U64 {
+    f1: [u8; 32],
+    f2: [u64; 4],
+}
+
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_gpu<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    let leaves_count = leaves.len() / leaf_size;
+
+    let num_gpus: usize = std::env::var("NUM_OF_GPUS")
+        .unwrap_or("1".to_string())
+        .parse()
+        .unwrap();
+
+    let mut gpu_id_lock = GPU_ID.lock().unwrap();
+    let gpu_id = *gpu_id_lock;
+    *gpu_id_lock += 1;
+    if *gpu_id_lock >= num_gpus as u64 {
+        *gpu_id_lock = 0;
+    }
+    log::debug!("Using GPU id {} leave length {}", gpu_id, leaves.len());
+
+    let now = Instant::now();
+    let gpu_leaves_buf_result = HostOrDeviceSlice::cuda_malloc(gpu_id as i32, leaves.len());
+
+    if gpu_leaves_buf_result.is_err() {
+        panic!("CUDA malloc failed, falling back to CPU for Merkle tree generation");
+    }
+
+    let mut gpu_leaves_buf = gpu_leaves_buf_result.unwrap();
+    print_time(now, "alloc gpu leaves buffer");
+
+    let now = Instant::now();
+    let _ = gpu_leaves_buf.copy_from_host(leaves.as_slice());
+    print_time(now, "leaves copy to gpu");
+
+    let now = Instant::now();
+    fill_digests_buf_gpu_ptr::<F, H>(
+        digests_buf,
+        cap_buf,
+        gpu_leaves_buf.as_mut_ptr(),
+        leaves_count,
+        leaf_size,
+        cap_height,
+        gpu_id,
+    );
+    print_time(now, "fill_digests_buf_gpu_ptr");
 }
 
-pub(crate) fn merkle_tree_prove<F: RichField, H: Hasher<F>>(
-    leaf_index: usize,
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_gpu_ptr<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves_ptr: *const F,
     leaves_len: usize,
+    leaf_len: usize,
     cap_height: usize,
-    digests: &[H::Hash],
-) -> Vec<H::Hash> {
-    let num_layers = log2_strict(leaves_len) - cap_height;
-    debug_assert_eq!(leaf_index >> (cap_height + num_layers), 0);
-
-    let digest_len = 2 * (leaves_len - (1 << cap_height));
-    assert_eq!(digest_len, digests.len());
-
-    let digest_tree: &[H::Hash] = {
-        let tree_index = leaf_index >> num_layers;
-        let tree_len = digest_len >> cap_height;
-        &digests[tree_len * tree_index..tree_len * (tree_index + 1)]
+    gpu_id: u64,
+) {
+    let digests_count: u64 = digests_buf.len().try_into().unwrap();
+    let leaves_count: u64 = leaves_len.try_into().unwrap();
+    let caps_count: u64 = cap_buf.len().try_into().unwrap();
+    let cap_height: u64 = cap_height.try_into().unwrap();
+    let leaf_size: u64 = leaf_len.try_into().unwrap();
+
+    let now = Instant::now();
+    // if digests_buf is empty (size 0), just allocate a few bytes to avoid errors
+    let digests_size = if digests_buf.len() == 0 {
+        NUM_HASH_OUT_ELTS
+    } else {
+        digests_buf.len() * NUM_HASH_OUT_ELTS
+    };
+    let caps_size = if cap_buf.len() == 0 {
+        NUM_HASH_OUT_ELTS
+    } else {
+        cap_buf.len() * NUM_HASH_OUT_ELTS
     };
 
-    // Mask out high bits to get the index within the sub-tree.
-    let mut pair_index = leaf_index & ((1 << num_layers) - 1);
-    (0..num_layers)
-        .map(|i| {
-            let parity = pair_index & 1;
-            pair_index >>= 1;
-
-            // The layers' data is interleaved as follows:
-            // [layer 0, layer 1, layer 0, layer 2, layer 0, layer 1, layer 0, layer 3, ...].
-            // Each of the above is a pair of siblings.
-            // `pair_index` is the index of the pair within layer `i`.
-            // The index of that the pair within `digests` is
-            // `pair_index * 2 ** (i + 1) + (2 ** i - 1)`.
-            let siblings_index = (pair_index << (i + 1)) + (1 << i) - 1;
-            // We have an index for the _pair_, but we want the index of the _sibling_.
-            // Double the pair index to get the index of the left sibling. Conditionally add `1`
-            // if we are to retrieve the right sibling.
-            let sibling_index = 2 * siblings_index + (1 - parity);
-            digest_tree[sibling_index]
-        })
-        .collect()
+    let mut gpu_digests_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, digests_size).unwrap();
+    let mut gpu_cap_buf: HostOrDeviceSlice<'_, F> =
+        HostOrDeviceSlice::cuda_malloc(gpu_id as i32, caps_size).unwrap();
+
+    unsafe {
+        let num_gpus: usize = std::env::var("NUM_OF_GPUS")
+            .unwrap_or("1".to_string())
+            .parse()
+            .unwrap();
+        if !FORCE_SINGLE_GPU
+            && leaves_count >= (1 << 12)
+            && cap_height > 0
+            && num_gpus > 1
+            && H::HASHER_TYPE == HasherType::PoseidonBN128
+        {
+            // println!("Multi GPU");
+            fill_digests_buf_linear_multigpu_with_gpu_ptr(
+                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                leaves_ptr as *mut core::ffi::c_void,
+                digests_count,
+                caps_count,
+                leaves_count,
+                leaf_size,
+                cap_height,
+                H::HASHER_TYPE as u64,
+            );
+        } else {
+            // println!("Single GPU");
+            fill_digests_buf_linear_gpu_with_gpu_ptr(
+                gpu_digests_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                gpu_cap_buf.as_mut_ptr() as *mut core::ffi::c_void,
+                leaves_ptr as *mut core::ffi::c_void,
+                digests_count,
+                caps_count,
+                leaves_count,
+                leaf_size,
+                cap_height,
+                H::HASHER_TYPE as u64,
+                gpu_id,
+            );
+        }
+    }
+    print_time(now, "fill init");
+
+    let mut host_digests: Vec<F> = vec![F::ZERO; digests_size];
+    let mut host_caps: Vec<F> = vec![F::ZERO; caps_size];
+    let stream1 = CudaStream::create().unwrap();
+    let stream2 = CudaStream::create().unwrap();
+
+    gpu_digests_buf
+        .copy_to_host_async(host_digests.as_mut_slice(), &stream1)
+        .expect("copy digests");
+    gpu_cap_buf
+        .copy_to_host_async(host_caps.as_mut_slice(), &stream2)
+        .expect("copy caps");
+    stream1.synchronize().expect("cuda sync");
+    stream2.synchronize().expect("cuda sync");
+    stream1.destroy().expect("cuda stream destroy");
+    stream2.destroy().expect("cuda stream destroy");
+
+    let now = Instant::now();
+
+    if digests_buf.len() > 0 {
+        host_digests
+            .chunks_exact(4)
+            .zip(digests_buf)
+            .for_each(|(x, y)| {
+                unsafe {
+                    let mut parts = U8U64 { f1: [0; 32] };
+                    parts.f2[0] = x[0].to_canonical_u64();
+                    parts.f2[1] = x[1].to_canonical_u64();
+                    parts.f2[2] = x[2].to_canonical_u64();
+                    parts.f2[3] = x[3].to_canonical_u64();
+                    let (slice, _) = parts.f1.split_at(H::HASH_SIZE);
+                    let h: H::Hash = H::Hash::from_bytes(slice);
+                    y.write(h);
+                };
+            });
+    }
+
+    if cap_buf.len() > 0 {
+        host_caps.chunks_exact(4).zip(cap_buf).for_each(|(x, y)| {
+            unsafe {
+                let mut parts = U8U64 { f1: [0; 32] };
+                parts.f2[0] = x[0].to_canonical_u64();
+                parts.f2[1] = x[1].to_canonical_u64();
+                parts.f2[2] = x[2].to_canonical_u64();
+                parts.f2[3] = x[3].to_canonical_u64();
+                let (slice, _) = parts.f1.split_at(H::HASH_SIZE);
+                let h: H::Hash = H::Hash::from_bytes(slice);
+                y.write(h);
+            };
+        });
+    }
+    print_time(now, "copy results");
+}
+
+#[cfg(feature = "cuda")]
+fn fill_digests_buf_meta<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    // if the input is small or if it Keccak hashing, just do the hashing on CPU
+    if leaf_size <= H::HASH_SIZE / 8 || H::HASHER_TYPE == HasherType::Keccak {
+        fill_digests_buf::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
+    } else {
+        fill_digests_buf_gpu::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
+    }
+}
+
+#[cfg(not(feature = "cuda"))]
+fn fill_digests_buf_meta<F: RichField, H: Hasher<F>>(
+    digests_buf: &mut [MaybeUninit<H::Hash>],
+    cap_buf: &mut [MaybeUninit<H::Hash>],
+    leaves: &Vec<F>,
+    leaf_size: usize,
+    cap_height: usize,
+) {
+    fill_digests_buf::<F, H>(digests_buf, cap_buf, leaves, leaf_size, cap_height);
 }
 
 impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
-    pub fn new(leaves: Vec<Vec<F>>, cap_height: usize) -> Self {
-        let log2_leaves_len = log2_strict(leaves.len());
+    pub fn new_from_1d(leaves_1d: Vec<F>, leaf_size: usize, cap_height: usize) -> Self {
+        let leaves_len = leaves_1d.len() / leaf_size;
+        let log2_leaves_len = log2_strict(leaves_len);
         assert!(
             cap_height <= log2_leaves_len,
             "cap_height={} should be at most log2(leaves.len())={}",
@@ -199,7 +481,7 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             log2_leaves_len
         );
 
-        let num_digests = 2 * (leaves.len() - (1 << cap_height));
+        let num_digests = 2 * (leaves_len - (1 << cap_height));
         let mut digests = Vec::with_capacity(num_digests);
 
         let len_cap = 1 << cap_height;
@@ -207,7 +489,42 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
 
         let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
         let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
-        fill_digests_buf::<F, H>(digests_buf, cap_buf, &leaves[..], cap_height);
+
+        #[cfg(feature = "cuda_sanity_check")]
+        let (digests_buf_cpu, cap_cpu) = {
+            let mut digests_buf_cpu = digests_buf.to_vec();
+            let mut cap_buf_cpu = cap_buf.to_vec();
+
+            fill_digests_buf::<F, H>(
+                &mut digests_buf_cpu,
+                &mut cap_buf_cpu,
+                &leaves_1d.clone(),
+                leaf_size,
+                cap_height,
+            );
+
+            (digests_buf_cpu, cap_buf_cpu)
+        };
+
+        fill_digests_buf_meta::<F, H>(digests_buf, cap_buf, &leaves_1d, leaf_size, cap_height);
+
+        #[cfg(feature = "cuda_sanity_check")]
+        {
+            for i in 0..num_digests {
+                unsafe {
+                    let hash1 = digests_buf[i].assume_init();
+                    let hash2 = digests_buf_cpu[i].assume_init();
+                    assert_eq!(hash1, hash2, "Digest mismatch at index {}", i);
+                }
+            }
+            for i in 0..len_cap {
+                unsafe {
+                    let hash1 = cap_buf[i].assume_init();
+                    let hash2 = cap_cpu[i].assume_init();
+                    assert_eq!(hash1, hash2, "Cap mismatch at index {}", i);
+                }
+            }
+        }
 
         unsafe {
             // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
@@ -215,38 +532,355 @@ impl<F: RichField, H: Hasher<F>> MerkleTree<F, H> {
             digests.set_len(num_digests);
             cap.set_len(len_cap);
         }
+        Self {
+            leaves: leaves_1d,
+            leaf_size,
+            digests,
+            cap: MerkleCap(cap),
+        }
+    }
+
+    pub fn new_from_2d(leaves_2d: Vec<Vec<F>>, cap_height: usize) -> Self {
+        let leaf_size = leaves_2d[0].len();
+        let leaves_count = leaves_2d.len();
+        let zeros = vec![F::from_canonical_u64(0); leaf_size];
+        let mut leaves_1d: Vec<F> = Vec::with_capacity(leaves_count * leaf_size);
+        for idx in 0..leaves_count {
+            if leaves_2d[idx].len() == 0 {
+                leaves_1d.extend(zeros.clone());
+            } else {
+                leaves_1d.extend(leaves_2d[idx].clone());
+            }
+        }
+        Self::new_from_1d(leaves_1d, leaf_size, cap_height)
+    }
 
+    pub fn new_from_fields(
+        leaves_1d: Vec<F>,
+        leaf_size: usize,
+        digests: Vec<H::Hash>,
+        cap: MerkleCap<F, H>,
+    ) -> Self {
         Self {
-            leaves,
+            leaves: leaves_1d,
+            leaf_size,
+            digests,
+            cap,
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub fn new_from_gpu_leaves(
+        leaves_gpu_ptr: &HostOrDeviceSlice<'_, F>,
+        leaves_len: usize,
+        leaf_len: usize,
+        cap_height: usize,
+    ) -> Self {
+        use plonky2_field::util::vec_zeroed;
+
+        let log2_leaves_len = log2_strict(leaves_len);
+        assert!(
+            cap_height <= log2_leaves_len,
+            "cap_height={} should be at most log2(leaves.len())={}",
+            cap_height,
+            log2_leaves_len
+        );
+
+        // copy data from GPU in async mode
+        let mut host_leaves: Vec<F> = unsafe { vec_zeroed(leaves_len * leaf_len) };
+        let stream_copy = CudaStream::create().unwrap();
+
+        let start = std::time::Instant::now();
+        leaves_gpu_ptr
+            .copy_to_host_async(host_leaves.as_mut_slice(), &stream_copy)
+            .expect("copy to host error");
+        print_time(start, "copy leaves from GPU async");
+
+        let num_digests = 2 * (leaves_len - (1 << cap_height));
+        let mut digests = Vec::with_capacity(num_digests);
+
+        let len_cap = 1 << cap_height;
+        let mut cap = Vec::with_capacity(len_cap);
+
+        let digests_buf = capacity_up_to_mut(&mut digests, num_digests);
+        let cap_buf = capacity_up_to_mut(&mut cap, len_cap);
+        let now = Instant::now();
+        let gpu_id = 0;
+        fill_digests_buf_gpu_ptr::<F, H>(
+            digests_buf,
+            cap_buf,
+            leaves_gpu_ptr.as_ptr(),
+            leaves_len,
+            leaf_len,
+            cap_height,
+            gpu_id,
+        );
+        print_time(now, "fill digests buffer");
+
+        unsafe {
+            // SAFETY: `fill_digests_buf` and `cap` initialized the spare capacity up to
+            // `num_digests` and `len_cap`, resp.
+            digests.set_len(num_digests);
+            cap.set_len(len_cap);
+        }
+        /*
+        println!{"Digest Buffer"};
+        for dg in &digests {
+            println!("{:?}", dg);
+        }
+        println!{"Cap Buffer"};
+        for dg in &cap {
+            println!("{:?}", dg);
+        }
+        */
+        let _ = stream_copy.synchronize();
+        let _ = stream_copy.destroy();
+
+        Self {
+            leaves: host_leaves,
+            leaf_size: leaf_len,
             digests,
             cap: MerkleCap(cap),
         }
     }
 
     pub fn get(&self, i: usize) -> &[F] {
-        &self.leaves[i]
+        let (_, v) = self.leaves.split_at(i * self.leaf_size);
+        let (v, _) = v.split_at(self.leaf_size);
+        v
+    }
+
+    pub fn get_leaves_1d(&self) -> Vec<F> {
+        self.leaves.clone()
+    }
+
+    pub fn get_leaves_2d(&self) -> Vec<Vec<F>> {
+        let v2d: Vec<Vec<F>> = self
+            .leaves
+            .chunks_exact(self.leaf_size)
+            .map(|leaf| leaf.to_vec())
+            .collect();
+        v2d
+    }
+
+    pub fn get_leaves_count(&self) -> usize {
+        self.leaves.len() / self.leaf_size
+    }
+
+    pub fn change_leaf_and_update(&mut self, leaf: Vec<F>, leaf_index: usize) {
+        assert_eq!(leaf.len(), self.leaf_size);
+        let leaves_count = self.leaves.len() / self.leaf_size;
+        assert!(leaf_index < leaves_count);
+
+        let cap_height = log2_strict(self.cap.len());
+        let mut leaves = self.leaves.clone();
+        let start = leaf_index * self.leaf_size;
+        let leaf_copy = leaf.clone();
+        leaf.into_iter()
+            .enumerate()
+            .for_each(|(i, el)| leaves[start + i] = el);
+
+        let digests_len = self.digests.len();
+        let cap_len = self.cap.0.len();
+        let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len);
+        let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len);
+        self.leaves = leaves;
+        if digests_buf.is_empty() {
+            cap_buf[leaf_index].write(H::hash_or_noop(leaf_copy.as_slice()));
+        } else {
+            let subtree_leaves_len = leaves_count >> cap_height;
+            let subtree_idx = leaf_index / subtree_leaves_len;
+            let subtree_digests_len = digests_buf.len() >> cap_height;
+            let subtree_offset = subtree_idx * subtree_digests_len;
+            let idx_in_subtree =
+                subtree_digests_len - subtree_leaves_len + leaf_index % subtree_leaves_len;
+            if subtree_leaves_len == 2 {
+                digests_buf[subtree_offset + idx_in_subtree]
+                    .write(H::hash_or_noop(leaf_copy.as_slice()));
+            } else {
+                assert!(subtree_leaves_len > 2);
+                let idx = subtree_offset + idx_in_subtree;
+                digests_buf[idx].write(H::hash_or_noop(leaf_copy.as_slice()));
+                let mut child_idx: i64 = idx_in_subtree as i64;
+                let mut parent_idx: i64 = child_idx / 2 - 1;
+                while child_idx > 1 {
+                    unsafe {
+                        let mut left_idx = subtree_offset + child_idx as usize;
+                        let mut right_idx = subtree_offset + child_idx as usize + 1;
+                        if child_idx & 1 == 1 {
+                            left_idx = subtree_offset + child_idx as usize - 1;
+                            right_idx = subtree_offset + child_idx as usize;
+                        }
+                        let left_digest = digests_buf[left_idx].assume_init();
+                        let right_digest = digests_buf[right_idx].assume_init();
+                        digests_buf[subtree_offset + parent_idx as usize]
+                            .write(H::two_to_one(left_digest, right_digest));
+                    }
+                    child_idx = parent_idx;
+                    parent_idx = child_idx / 2 - 1;
+                }
+            }
+            unsafe {
+                let left_digest = digests_buf[subtree_offset].assume_init();
+                let right_digest = digests_buf[subtree_offset + 1].assume_init();
+                cap_buf[subtree_idx].write(H::two_to_one(left_digest, right_digest));
+            }
+        }
+    }
+
+    pub fn change_leaves_in_range_and_update(
+        &mut self,
+        new_leaves: Vec<Vec<F>>,
+        start_index: usize,
+        end_index: usize,
+    ) {
+        assert_eq!(new_leaves.len(), end_index - start_index);
+        assert_eq!(new_leaves[0].len(), self.leaf_size);
+
+        let tree_leaves_count = self.leaves.len() / self.leaf_size;
+        assert!(start_index < end_index);
+        assert!(end_index < tree_leaves_count);
+
+        let cap_height = log2_strict(self.cap.len());
+        let mut leaves = self.leaves.clone();
+
+        leaves[start_index * self.leaf_size..end_index * self.leaf_size]
+            .chunks_exact_mut(self.leaf_size)
+            .zip(new_leaves.clone())
+            .for_each(|(x, y)| {
+                for j in 0..self.leaf_size {
+                    x[j] = y[j];
+                }
+            });
+
+        let digests_len = self.digests.len();
+        let cap_len = self.cap.0.len();
+        let digests_buf = capacity_up_to_mut(&mut self.digests, digests_len);
+        let cap_buf = capacity_up_to_mut(&mut self.cap.0, cap_len);
+        self.leaves = leaves;
+        if digests_buf.is_empty() {
+            cap_buf[start_index..end_index]
+                .par_iter_mut()
+                .zip(new_leaves)
+                .for_each(|(cap, leaf)| {
+                    cap.write(H::hash_or_noop(leaf.as_slice()));
+                });
+        } else {
+            let subtree_leaves_len = tree_leaves_count >> cap_height;
+            let subtree_digests_len = digests_buf.len() >> cap_height;
+
+            let mut positions: Vec<usize> = (start_index..end_index)
+                .map(|idx| {
+                    let subtree_idx = idx / subtree_leaves_len;
+                    let subtree_offset = subtree_idx * subtree_digests_len;
+                    let idx_in_subtree =
+                        subtree_digests_len - subtree_leaves_len + idx % subtree_leaves_len;
+                    subtree_offset + idx_in_subtree
+                })
+                .collect();
+
+            // TODO change to parallel loop
+            for i in 0..positions.len() {
+                digests_buf[positions[i]].write(H::hash_or_noop(new_leaves[i].as_slice()));
+            }
+
+            if subtree_digests_len > 2 {
+                let rounds = log2_strict(tree_leaves_count) - cap_height - 1;
+                for _ in 0..rounds {
+                    let mut parent_indexes: HashSet<usize> = HashSet::new();
+                    let parents: Vec<usize> = positions
+                        .par_iter()
+                        .map(|pos| {
+                            let subtree_offset = pos / subtree_digests_len;
+                            let idx_in_subtree = pos % subtree_digests_len;
+                            let mut parent_idx = 0;
+                            if idx_in_subtree > 1 {
+                                parent_idx = idx_in_subtree / 2 - 1;
+                            }
+                            subtree_offset * subtree_digests_len + parent_idx
+                        })
+                        .collect();
+                    for p in parents {
+                        parent_indexes.insert(p);
+                    }
+                    positions = parent_indexes.into_iter().collect();
+
+                    // TODO change to parallel loop
+                    for i in 0..positions.len() {
+                        let subtree_offset = positions[i] / subtree_digests_len;
+                        let idx_in_subtree = positions[i] % subtree_digests_len;
+                        let digest_idx =
+                            subtree_offset * subtree_digests_len + 2 * (idx_in_subtree + 1);
+                        unsafe {
+                            let left_digest = digests_buf[digest_idx].assume_init();
+                            let right_digest = digests_buf[digest_idx + 1].assume_init();
+                            digests_buf[positions[i]]
+                                .write(H::two_to_one(left_digest, right_digest));
+                        }
+                    }
+                }
+            }
+
+            let mut cap_indexes: HashSet<usize> = HashSet::new();
+            for idx in start_index..end_index {
+                cap_indexes.insert(idx / subtree_leaves_len);
+            }
+
+            unsafe {
+                for idx in cap_indexes {
+                    let digest_idx = idx * subtree_digests_len;
+                    let left_digest = digests_buf[digest_idx].assume_init();
+                    let right_digest = digests_buf[digest_idx + 1].assume_init();
+                    cap_buf[idx].write(H::two_to_one(left_digest, right_digest));
+                }
+            }
+        }
     }
 
     /// Create a Merkle proof from a leaf index.
     pub fn prove(&self, leaf_index: usize) -> MerkleProof<F, H> {
         let cap_height = log2_strict(self.cap.len());
-        let siblings =
-            merkle_tree_prove::<F, H>(leaf_index, self.leaves.len(), cap_height, &self.digests);
+        let num_layers = log2_strict(self.get_leaves_count()) - cap_height;
+        let subtree_digest_size = (1 << (num_layers + 1)) - 2; // 2 ^ (k+1) - 2
+        let subtree_idx = leaf_index / (1 << num_layers);
+
+        let siblings: Vec<<H as Hasher<F>>::Hash> = Vec::with_capacity(num_layers);
+        if num_layers == 0 {
+            return MerkleProof { siblings };
+        }
+
+        // digests index where we start
+        let idx = subtree_digest_size - (1 << num_layers) + (leaf_index % (1 << num_layers));
+
+        let siblings = (0..num_layers)
+            .map(|i| {
+                // relative index
+                let rel_idx = (idx + 2 - (1 << i + 1)) / (1 << i);
+                // absolute index
+                let mut abs_idx = subtree_idx * subtree_digest_size + rel_idx;
+                if (rel_idx & 1) == 1 {
+                    abs_idx -= 1;
+                } else {
+                    abs_idx += 1;
+                }
+                self.digests[abs_idx]
+            })
+            .collect();
 
         MerkleProof { siblings }
     }
 }
 
 #[cfg(test)]
-pub(crate) mod tests {
+mod tests {
     use anyhow::Result;
 
     use super::*;
     use crate::field::extension::Extendable;
     use crate::hash::merkle_proofs::verify_merkle_proof_to_cap;
-    use crate::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
+    use crate::plonk::config::{GenericConfig, KeccakGoldilocksConfig, PoseidonGoldilocksConfig};
 
-    pub(crate) fn random_data<F: RichField>(n: usize, k: usize) -> Vec<Vec<F>> {
+    fn random_data<F: RichField>(n: usize, k: usize) -> Vec<Vec<F>> {
         (0..n).map(|_| F::rand_vec(k)).collect()
     }
 
@@ -258,7 +892,7 @@ pub(crate) mod tests {
         leaves: Vec<Vec<F>>,
         cap_height: usize,
     ) -> Result<()> {
-        let tree = MerkleTree::<F, C::Hasher>::new(leaves.clone(), cap_height);
+        let tree = MerkleTree::<F, C::Hasher>::new_from_2d(leaves.clone(), cap_height);
         for (i, leaf) in leaves.into_iter().enumerate() {
             let proof = tree.prove(i);
             verify_merkle_proof_to_cap(leaf, i, &tree.cap, &proof)?;
@@ -266,6 +900,224 @@ pub(crate) mod tests {
         Ok(())
     }
 
+    fn verify_change_leaf_and_update(log_n: usize, cap_h: usize) {
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let n = 1 << log_n;
+        let k = 7;
+        let mut leaves = random_data::<F>(n, k);
+
+        let mut mt1 =
+            MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves.clone(), cap_h);
+
+        let tmp = random_data::<F>(1, k);
+        leaves[0] = tmp[0].clone();
+        let mt2 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves, cap_h);
+
+        mt1.change_leaf_and_update(tmp[0].clone(), 0);
+
+        /*
+        println!("Tree 1");
+        mt1.digests.into_iter().for_each(
+            |x| {
+                println!("{:?}", x);
+            }
+        );
+        println!("Tree 2");
+        mt2.digests.into_iter().for_each(
+            |x| {
+                println!("{:?}", x);
+            }
+        );
+        */
+
+        mt1.digests
+            .into_par_iter()
+            .zip(mt2.digests)
+            .for_each(|(d1, d2)| {
+                assert_eq!(d1, d2);
+            });
+
+        mt1.cap
+            .0
+            .into_par_iter()
+            .zip(mt2.cap.0)
+            .for_each(|(d1, d2)| {
+                assert_eq!(d1, d2);
+            });
+    }
+
+    fn verify_change_leaf_and_update_range_one_by_one(
+        leaves_count: usize,
+        leaf_size: usize,
+        cap_height: usize,
+        start_index: usize,
+        end_index: usize,
+    ) {
+        use plonky2_field::types::Field;
+
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let raw_leaves: Vec<Vec<F>> = random_data::<F>(leaves_count, leaf_size);
+        let vals: Vec<Vec<F>> = random_data::<F>(end_index - start_index, leaf_size);
+
+        let mut leaves1_1d: Vec<F> = raw_leaves.into_iter().flatten().collect();
+        let leaves2_1d: Vec<F> = leaves1_1d.clone();
+
+        let mut tree2 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves2_1d, leaf_size, cap_height,
+        );
+
+        // v1
+        let now = Instant::now();
+        for i in start_index..end_index {
+            for j in 0..leaf_size {
+                leaves1_1d[i * leaf_size + j] = vals[i - start_index][j];
+            }
+        }
+        let tree1 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves1_1d, leaf_size, cap_height,
+        );
+        println!("Time V1: {} ms", now.elapsed().as_millis());
+
+        // v2
+        let now = Instant::now();
+        for idx in start_index..end_index {
+            let mut leaf: Vec<F> = vec![F::from_canonical_u64(0); leaf_size];
+            for j in 0..leaf_size {
+                leaf[j] = vals[idx - start_index][j];
+            }
+            tree2.change_leaf_and_update(leaf, idx);
+        }
+        println!("Time V2: {} ms", now.elapsed().as_millis());
+
+        // compare leaves
+        let t2leaves = tree2.get_leaves_1d();
+        tree1
+            .get_leaves_1d()
+            .chunks_exact(leaf_size)
+            .enumerate()
+            .for_each(|(i, x)| {
+                let mut ok = true;
+                for j in 0..leaf_size {
+                    if x[j] != t2leaves[i * leaf_size + j] {
+                        ok = false;
+                        break;
+                    }
+                }
+                if !ok {
+                    println!("Leaves different at index {:?}", i);
+                }
+                assert!(ok);
+            });
+
+        // compare trees
+        tree1.digests.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.digests[i];
+            if x != y {
+                println!("Digests different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+        tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.cap.0[i];
+            if x != y {
+                println!("Cap different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+    }
+
+    fn verify_change_leaf_and_update_range(
+        leaves_count: usize,
+        leaf_size: usize,
+        cap_height: usize,
+        start_index: usize,
+        end_index: usize,
+    ) {
+        // use plonky2_field::types::Field;
+
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let raw_leaves: Vec<Vec<F>> = random_data::<F>(leaves_count, leaf_size);
+        let vals: Vec<Vec<F>> = random_data::<F>(end_index - start_index, leaf_size);
+
+        let mut leaves1_1d: Vec<F> = raw_leaves.into_iter().flatten().collect();
+        let leaves2_1d: Vec<F> = leaves1_1d.clone();
+
+        let mut tree2 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves2_1d, leaf_size, cap_height,
+        );
+
+        // v1
+        let now = Instant::now();
+        for i in start_index..end_index {
+            for j in 0..leaf_size {
+                leaves1_1d[i * leaf_size + j] = vals[i - start_index][j];
+            }
+        }
+        let tree1 = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            leaves1_1d, leaf_size, cap_height,
+        );
+        println!("Time V1: {} ms", now.elapsed().as_millis());
+
+        // v2
+        let now = Instant::now();
+        /*
+        for idx in start_index..end_index {
+            let mut leaf: Vec<F> = vec![F::from_canonical_u64(0); leaf_size];
+            for j in 0..leaf_size {
+                leaf[j] = vals[idx - start_index][j];
+            }
+            tree2.change_leaf_and_update(leaf, idx);
+        }
+        */
+        tree2.change_leaves_in_range_and_update(vals, start_index, end_index);
+        println!("Time V2: {} ms", now.elapsed().as_millis());
+
+        // compare leaves
+        let t2leaves = tree2.get_leaves_1d();
+        tree1
+            .get_leaves_1d()
+            .chunks_exact(leaf_size)
+            .enumerate()
+            .for_each(|(i, x)| {
+                let mut ok = true;
+                for j in 0..leaf_size {
+                    if x[j] != t2leaves[i * leaf_size + j] {
+                        ok = false;
+                        break;
+                    }
+                }
+                if !ok {
+                    println!("Leaves different at index {:?}", i);
+                }
+                assert!(ok);
+            });
+
+        // compare trees
+        tree1.digests.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.digests[i];
+            if x != y {
+                println!("Digests different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+        tree1.cap.0.into_iter().enumerate().for_each(|(i, x)| {
+            let y = tree2.cap.0[i];
+            if x != y {
+                println!("Cap different at index {:?}", i);
+            }
+            assert_eq!(x, y);
+        });
+    }
+
     #[test]
     #[should_panic]
     fn test_cap_height_too_big() {
@@ -277,7 +1129,7 @@ pub(crate) mod tests {
         let cap_height = log_n + 1; // Should panic if `cap_height > len_n`.
 
         let leaves = random_data::<F>(1 << log_n, 7);
-        let _ = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new(leaves, cap_height);
+        let _ = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_2d(leaves, cap_height);
     }
 
     #[test]
@@ -296,12 +1148,89 @@ pub(crate) mod tests {
     }
 
     #[test]
-    fn test_merkle_trees() -> Result<()> {
+    fn test_change_leaf_and_update() -> Result<()> {
+        // small tree, 1 cap
+        verify_change_leaf_and_update(3, 0);
+        // small tree, 2 cap
+        verify_change_leaf_and_update(3, 1);
+        // small tree, 4 cap
+        verify_change_leaf_and_update(3, 2);
+        // small tree, all cap
+        verify_change_leaf_and_update(3, 3);
+
+        // big tree
+        verify_change_leaf_and_update(12, 3);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_change_leaf_and_update_range() -> Result<()> {
+        for h in 0..11 {
+            println!(
+                "Run verify_change_leaf_and_update_range_one_by_one() for height {:?}",
+                h
+            );
+            verify_change_leaf_and_update_range_one_by_one(1024, 68, h, 32, 48);
+            println!(
+                "Run verify_change_leaf_and_update_range() for height {:?}",
+                h
+            );
+            verify_change_leaf_and_update_range(1024, 68, h, 32, 48);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_merkle_trees_poseidon_g64() -> Result<()> {
         const D: usize = 2;
         type C = PoseidonGoldilocksConfig;
         type F = <C as GenericConfig<D>>::F;
 
-        let log_n = 8;
+        // GPU warmup
+        #[cfg(feature = "cuda")]
+        let _x: HostOrDeviceSlice<'_, F> = HostOrDeviceSlice::cuda_malloc(0, 64).unwrap();
+
+        let log_n = 12;
+        let n = 1 << log_n;
+        let leaves = random_data::<F>(n, 7);
+
+        verify_all_leaves::<F, C, D>(leaves, 1)?;
+
+        Ok(())
+    }
+
+    #[cfg(feature = "cuda")]
+    #[test]
+    fn test_merkle_trees_cuda_poseidon_g64() -> Result<()> {
+        const D: usize = 2;
+        type C = PoseidonGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let log_n = 14;
+        let n = 1 << log_n;
+        let leaves = random_data::<F>(n, 7);
+        let leaves_1d: Vec<F> = leaves.into_iter().flatten().collect();
+
+        let mut gpu_data: HostOrDeviceSlice<'_, F> =
+            HostOrDeviceSlice::cuda_malloc(0, n * 7).unwrap();
+        gpu_data
+            .copy_from_host(leaves_1d.as_slice())
+            .expect("copy data to gpu");
+
+        MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_gpu_leaves(&gpu_data, n, 7, 1);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_merkle_trees_keccak() -> Result<()> {
+        const D: usize = 2;
+        type C = KeccakGoldilocksConfig;
+        type F = <C as GenericConfig<D>>::F;
+
+        let log_n = 12;
         let n = 1 << log_n;
         let leaves = random_data::<F>(n, 7);
 
diff --git a/plonky2/src/hash/mod.rs b/plonky2/src/hash/mod.rs
index 0d3436973..d8090310d 100644
--- a/plonky2/src/hash/mod.rs
+++ b/plonky2/src/hash/mod.rs
@@ -2,7 +2,7 @@
 //! as well as specific hash functions implementation.
 
 mod arch;
-pub mod batch_merkle_tree;
+// pub mod batch_merkle_tree;
 pub mod hash_types;
 pub mod hashing;
 pub mod keccak;
diff --git a/plonky2/src/hash/path_compression.rs b/plonky2/src/hash/path_compression.rs
index 517576bf0..5a7f7e1ca 100644
--- a/plonky2/src/hash/path_compression.rs
+++ b/plonky2/src/hash/path_compression.rs
@@ -129,8 +129,14 @@ mod tests {
         type F = <C as GenericConfig<D>>::F;
         let h = 10;
         let cap_height = 3;
-        let vs = (0..1 << h).map(|_| vec![F::rand()]).collect::<Vec<_>>();
-        let mt = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new(vs.clone(), cap_height);
+        let vs = (0..1 << h)
+            .flat_map(|_| vec![F::rand()])
+            .collect::<Vec<_>>();
+        let mt = MerkleTree::<F, <C as GenericConfig<D>>::Hasher>::new_from_1d(
+            vs.clone(),
+            1,
+            cap_height,
+        );
 
         let mut rng = OsRng;
         let k = rng.gen_range(1..=1 << h);
@@ -139,7 +145,10 @@ mod tests {
 
         let compressed_proofs = compress_merkle_proofs(cap_height, &indices, &proofs);
         let decompressed_proofs = decompress_merkle_proofs(
-            &indices.iter().map(|&i| vs[i].clone()).collect::<Vec<_>>(),
+            &indices
+                .iter()
+                .map(|&i| vec![vs[i].clone()])
+                .collect::<Vec<_>>(),
             &indices,
             &compressed_proofs,
             h,
diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs
index a7c763252..2b6dd0ac2 100644
--- a/plonky2/src/hash/poseidon.rs
+++ b/plonky2/src/hash/poseidon.rs
@@ -18,7 +18,7 @@ use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation};
 use crate::iop::ext_target::ExtensionTarget;
 use crate::iop::target::{BoolTarget, Target};
 use crate::plonk::circuit_builder::CircuitBuilder;
-use crate::plonk::config::{AlgebraicHasher, Hasher};
+use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType};
 
 pub const SPONGE_RATE: usize = 8;
 pub const SPONGE_CAPACITY: usize = 4;
@@ -874,6 +874,7 @@ impl<T: Copy + Debug + Default + Eq + Permuter + Send + Sync> PlonkyPermutation<
 pub struct PoseidonHash;
 impl<F: RichField> Hasher<F> for PoseidonHash {
     const HASH_SIZE: usize = 4 * 8;
+    const HASHER_TYPE: HasherType = HasherType::Poseidon;
     type Hash = HashOut<F>;
     type Permutation = PoseidonPermutation<F>;
 
diff --git a/plonky2/src/hash/poseidon2/hash.rs b/plonky2/src/hash/poseidon2/hash.rs
index d6fb72bfd..e73db08bc 100644
--- a/plonky2/src/hash/poseidon2/hash.rs
+++ b/plonky2/src/hash/poseidon2/hash.rs
@@ -12,7 +12,7 @@ use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation};
 use crate::iop::ext_target::ExtensionTarget;
 use crate::iop::target::{BoolTarget, Target};
 use crate::plonk::circuit_builder::CircuitBuilder;
-use crate::plonk::config::{AlgebraicHasher, Hasher};
+use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType};
 
 pub trait Poseidon2: PrimeField64 {
     #[inline]
@@ -498,6 +498,7 @@ fn sum_12<F: PrimeField64>(inputs: &[F]) -> F {
 pub struct Poseidon2Hash;
 impl<F: RichField + Poseidon2> Hasher<F> for Poseidon2Hash {
     const HASH_SIZE: usize = 4 * 8;
+    const HASHER_TYPE: HasherType = HasherType::Poseidon2;
     type Hash = HashOut<F>;
     type Permutation = Poseidon2Permutation<F>;
 
diff --git a/plonky2/src/iop/generator.rs b/plonky2/src/iop/generator.rs
index f81508b7a..d24d8d42b 100644
--- a/plonky2/src/iop/generator.rs
+++ b/plonky2/src/iop/generator.rs
@@ -36,7 +36,6 @@ pub fn generate_partial_witness<
     let config = &common_data.config;
     let generators = &prover_data.generators;
     let generator_indices_by_watches = &prover_data.generator_indices_by_watches;
-
     let mut witness = PartitionWitness::new(
         config.num_wires,
         common_data.degree(),
diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs
index 8772ecfc0..4bf8cc982 100644
--- a/plonky2/src/lib.rs
+++ b/plonky2/src/lib.rs
@@ -11,7 +11,7 @@ pub extern crate alloc;
 #[doc(inline)]
 pub use plonky2_field as field;
 
-pub mod batch_fri;
+// pub mod batch_fri;
 pub mod fri;
 pub mod gadgets;
 pub mod gates;
diff --git a/plonky2/src/plonk/circuit_builder.rs b/plonky2/src/plonk/circuit_builder.rs
index 421d15cdc..74fa4be62 100644
--- a/plonky2/src/plonk/circuit_builder.rs
+++ b/plonky2/src/plonk/circuit_builder.rs
@@ -1225,6 +1225,24 @@ impl<F: RichField + Extendable<D>, const D: usize> CircuitBuilder<F, D> {
         let max_fft_points = 1 << (degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor)));
         let fft_root_table = fft_root_table(max_fft_points);
 
+        // Initialize GPU twiddle factors for all sizes we'll use
+        #[cfg(feature = "cuda")]
+        {
+            if F::CUDA_SUPPORT {
+                zeknox::clear_cuda_errors_rs();
+                // Initialize twiddle factors for degree and LDE sizes
+                // degree_bits: the base degree
+                // degree_bits + rate_bits: the LDE size used in from_coeffs
+                // We need to initialize from 0 to cover all potential sizes
+                let max_log_size = degree_bits + max(rate_bits, log2_ceil(quotient_degree_factor));
+                for i in 0..=max_log_size {
+                    zeknox::init_twiddle_factors_rs(0, i);
+                }
+            }
+        }
+
+        // This part of the code on GPU is buggy. So we use CPU for computation.
+        // It does not impact performance as this is only done once during setup.
         let constants_sigmas_commitment = if commit_to_sigma {
             let constants_sigmas_vecs = [constant_vecs, sigma_vecs.clone()].concat();
             PolynomialBatch::<F, C, D>::from_values(
diff --git a/plonky2/src/plonk/circuit_data.rs b/plonky2/src/plonk/circuit_data.rs
index e06e9b69b..61aa36ab0 100644
--- a/plonky2/src/plonk/circuit_data.rs
+++ b/plonky2/src/plonk/circuit_data.rs
@@ -19,6 +19,7 @@ use core::ops::{Range, RangeFrom};
 use std::collections::BTreeMap;
 
 use anyhow::Result;
+use log::Level;
 use serde::Serialize;
 
 use super::circuit_builder::LookupWire;
@@ -213,12 +214,11 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
     }
 
     pub fn prove(&self, inputs: PartialWitness<F>) -> Result<ProofWithPublicInputs<F, C, D>> {
-        prove::<F, C, D>(
-            &self.prover_only,
-            &self.common,
-            inputs,
-            &mut TimingTree::default(),
-        )
+        let mut timing = TimingTree::new("CircuitData::prove", Level::Debug);
+
+        let res = prove::<F, C, D>(&self.prover_only, &self.common, inputs, &mut timing);
+        timing.print();
+        res
     }
 
     pub fn verify(&self, proof_with_pis: ProofWithPublicInputs<F, C, D>) -> Result<()> {
diff --git a/plonky2/src/plonk/config.rs b/plonky2/src/plonk/config.rs
index 4bbcbc4bc..b3c44fce4 100644
--- a/plonky2/src/plonk/config.rs
+++ b/plonky2/src/plonk/config.rs
@@ -24,6 +24,14 @@ use crate::hash::poseidon2::hash::Poseidon2Hash;
 use crate::iop::target::{BoolTarget, Target};
 use crate::plonk::circuit_builder::CircuitBuilder;
 
+#[derive(PartialEq, Debug, Copy, Clone)]
+pub enum HasherType {
+    Poseidon = 0,
+    Keccak = 1,
+    PoseidonBN128 = 2,
+    Poseidon2 = 3,
+}
+
 pub trait GenericHashOut<F: RichField>:
     Copy + Clone + Debug + Eq + PartialEq + Send + Sync + Serialize + DeserializeOwned
 {
@@ -35,6 +43,8 @@ pub trait GenericHashOut<F: RichField>:
 
 /// Trait for hash functions.
 pub trait Hasher<F: RichField>: Sized + Copy + Debug + Eq + PartialEq {
+    const HASHER_TYPE: HasherType;
+
     /// Size of `Hash` in bytes.
     const HASH_SIZE: usize;
 
diff --git a/plonky2/src/plonk/prover.rs b/plonky2/src/plonk/prover.rs
index ac0a683cf..18ce4fc64 100644
--- a/plonky2/src/plonk/prover.rs
+++ b/plonky2/src/plonk/prover.rs
@@ -7,6 +7,7 @@ use core::mem::swap;
 
 use anyhow::{ensure, Result};
 use hashbrown::HashMap;
+use plonky2_field::util::vec_zeroed;
 use plonky2_maybe_rayon::*;
 
 use super::circuit_builder::{LookupChallenges, LookupWire};
@@ -159,17 +160,16 @@ where
         "compute full witness",
         partition_witness.full_witness()
     );
-
     let wires_values: Vec<PolynomialValues<F>> = timed!(
         timing,
         "compute wire polynomials",
+        // Use sequential iteration for deterministic results
         witness
             .wire_values
             .par_iter()
             .map(|column| PolynomialValues::new(column.clone()))
             .collect()
     );
-
     let wires_commitment = timed!(
         timing,
         "compute wires commitment",
@@ -182,7 +182,6 @@ where
             prover_data.fft_root_table.as_ref(),
         )
     );
-
     let mut challenger = Challenger::<F, C::Hasher>::new();
 
     // Observe the FRI config
@@ -270,6 +269,7 @@ where
             &gammas,
             &deltas,
             &alphas,
+            timing,
         )
     );
 
@@ -304,6 +304,7 @@ where
     challenger.observe_cap::<C::Hasher>(&quotient_polys_commitment.merkle_tree.cap);
 
     let zeta = challenger.get_extension_challenge::<D>();
+
     // To avoid leaking witness data, we want to ensure that our opening locations, `zeta` and
     // `g * zeta`, are not in our subgroup `H`. It suffices to check `zeta` only, since
     // `(g * zeta)^n = zeta^n`, where `n` is the order of `g`.
@@ -326,6 +327,7 @@ where
             common_data
         )
     );
+
     challenger.observe_openings(&openings.to_fri_openings());
     let instance = common_data.get_fri_instance(zeta);
 
@@ -621,6 +623,7 @@ fn compute_quotient_polys<
     gammas: &[F],
     deltas: &[F],
     alphas: &[F],
+    timing: &mut TimingTree,
 ) -> Vec<PolynomialCoeffs<F>> {
     let num_challenges = common_data.config.num_challenges;
 
@@ -640,7 +643,12 @@ fn compute_quotient_polys<
     // steps away since we work on an LDE of degree `max_filtered_constraint_degree`.
     let next_step = 1 << quotient_degree_bits;
 
-    let points = F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits);
+    let points = timed!(
+        timing,
+        "set up subgroup generators",
+        F::two_adic_subgroup(common_data.degree_bits() + quotient_degree_bits)
+    );
+
     let lde_size = points.len();
 
     let z_h_on_coset = ZeroPolyOnCoset::new(common_data.degree_bits(), quotient_degree_bits);
@@ -649,167 +657,184 @@ fn compute_quotient_polys<
     // These values are used to produce the final RE constraints for each lut,
     // and are the same each time in check_lookup_constraints_batched.
     // lut_poly_evals[i][j] gives the eval for the i'th challenge and the j'th lookup table
-    let lut_re_poly_evals: Vec<Vec<F>> = if has_lookup {
-        let num_lut_slots = LookupTableGate::num_slots(&common_data.config);
-        (0..num_challenges)
-            .map(move |i| {
-                let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)];
-                let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize];
-
-                (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors)
-                    .map(|r| {
-                        let lut_row_number = common_data.luts
-                            [r - LookupSelectors::StartEnd as usize]
-                            .len()
-                            .div_ceil(num_lut_slots);
-
-                        get_lut_poly(
-                            common_data,
-                            r - LookupSelectors::StartEnd as usize,
-                            cur_deltas,
-                            num_lut_slots * lut_row_number,
-                        )
-                        .eval(cur_challenge_delta)
-                    })
-                    .collect()
-            })
-            .collect()
-    } else {
-        vec![]
-    };
+    let lut_re_poly_evals: Vec<Vec<F>> = timed!(
+        timing,
+        "compute LUT RE polynomial evals",
+        if has_lookup {
+            let num_lut_slots = LookupTableGate::num_slots(&common_data.config);
+            (0..num_challenges)
+                .map(move |i| {
+                    let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)];
+                    let cur_challenge_delta = cur_deltas[LookupChallenges::ChallengeDelta as usize];
+
+                    (LookupSelectors::StartEnd as usize..common_data.num_lookup_selectors)
+                        .map(|r| {
+                            let lut_row_number = common_data.luts
+                                [r - LookupSelectors::StartEnd as usize]
+                                .len()
+                                .div_ceil(num_lut_slots);
+
+                            get_lut_poly(
+                                common_data,
+                                r - LookupSelectors::StartEnd as usize,
+                                cur_deltas,
+                                num_lut_slots * lut_row_number,
+                            )
+                            .eval(cur_challenge_delta)
+                        })
+                        .collect()
+                })
+                .collect()
+        } else {
+            vec![]
+        }
+    );
 
     let lut_re_poly_evals_refs: Vec<&[F]> =
         lut_re_poly_evals.iter().map(|v| v.as_slice()).collect();
 
-    let points_batches = points.par_chunks(BATCH_SIZE);
     let num_batches = points.len().div_ceil(BATCH_SIZE);
 
-    let quotient_values: Vec<Vec<F>> = points_batches
-        .enumerate()
-        .flat_map(|(batch_i, xs_batch)| {
-            // Each batch must be the same size, except the last one, which may be smaller.
-            debug_assert!(
-                xs_batch.len() == BATCH_SIZE
-                    || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE)
-            );
-
-            let indices_batch: Vec<usize> =
-                (BATCH_SIZE * batch_i..BATCH_SIZE * batch_i + xs_batch.len()).collect();
-
-            let mut shifted_xs_batch = Vec::with_capacity(xs_batch.len());
-            let mut local_zs_batch = Vec::with_capacity(xs_batch.len());
-            let mut next_zs_batch = Vec::with_capacity(xs_batch.len());
-
-            let mut local_lookup_batch = Vec::with_capacity(xs_batch.len());
-            let mut next_lookup_batch = Vec::with_capacity(xs_batch.len());
-
-            let mut partial_products_batch = Vec::with_capacity(xs_batch.len());
-            let mut s_sigmas_batch = Vec::with_capacity(xs_batch.len());
-
-            let mut local_constants_batch_refs = Vec::with_capacity(xs_batch.len());
-            let mut local_wires_batch_refs = Vec::with_capacity(xs_batch.len());
-
-            for (&i, &x) in indices_batch.iter().zip(xs_batch) {
-                let shifted_x = F::coset_shift() * x;
-                let i_next = (i + next_step) % lde_size;
-                let local_constants_sigmas = prover_data
-                    .constants_sigmas_commitment
-                    .get_lde_values(i, step);
-                let local_constants = &local_constants_sigmas[common_data.constants_range()];
-                let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()];
-                let local_wires = wires_commitment.get_lde_values(i, step);
-                let local_zs_partial_and_lookup =
-                    zs_partial_products_and_lookup_commitment.get_lde_values(i, step);
-                let next_zs_partial_and_lookup =
-                    zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step);
-
-                let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()];
-
-                let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()];
-
-                let partial_products =
-                    &local_zs_partial_and_lookup[common_data.partial_products_range()];
-
-                if has_lookup {
-                    let local_lookup_zs = &local_zs_partial_and_lookup[common_data.lookup_range()];
-
-                    let next_lookup_zs = &next_zs_partial_and_lookup[common_data.lookup_range()];
-                    debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys());
-
-                    local_lookup_batch.push(local_lookup_zs);
-                    next_lookup_batch.push(next_lookup_zs);
-                }
+    let quotient_values: Vec<Vec<F>> = timed!(timing, "compute quotient value", {
+        points
+            .par_chunks(BATCH_SIZE)
+            .enumerate()
+            .flat_map(|(batch_i, xs_batch)| {
+                // Each batch must be the same size, except the last one, which may be smaller.
+                debug_assert!(
+                    xs_batch.len() == BATCH_SIZE
+                        || (batch_i == num_batches - 1 && xs_batch.len() <= BATCH_SIZE)
+                );
 
-                debug_assert_eq!(local_wires.len(), common_data.config.num_wires);
-                debug_assert_eq!(local_zs.len(), num_challenges);
+                let batch_size = xs_batch.len();
+                let batch_start = BATCH_SIZE * batch_i;
 
-                local_constants_batch_refs.push(local_constants);
-                local_wires_batch_refs.push(local_wires);
+                let mut shifted_xs_batch = Vec::with_capacity(batch_size);
+                let mut local_zs_batch = Vec::with_capacity(batch_size);
+                let mut next_zs_batch = Vec::with_capacity(batch_size);
 
-                shifted_xs_batch.push(shifted_x);
-                local_zs_batch.push(local_zs);
-                next_zs_batch.push(next_zs);
-                partial_products_batch.push(partial_products);
-                s_sigmas_batch.push(s_sigmas);
-            }
+                let mut local_lookup_batch = Vec::with_capacity(batch_size);
+                let mut next_lookup_batch = Vec::with_capacity(batch_size);
+
+                let mut partial_products_batch = Vec::with_capacity(batch_size);
+                let mut s_sigmas_batch = Vec::with_capacity(batch_size);
+
+                let mut local_constants_batch_refs = Vec::with_capacity(batch_size);
+                let mut local_wires_batch_refs = Vec::with_capacity(batch_size);
+
+                for (j, &x) in xs_batch.iter().enumerate() {
+                    let i = batch_start + j;
+                    let shifted_x = F::coset_shift() * x;
+                    let i_next = (i + next_step) % lde_size;
+                    let local_constants_sigmas = prover_data
+                        .constants_sigmas_commitment
+                        .get_lde_values(i, step);
+                    let local_constants = &local_constants_sigmas[common_data.constants_range()];
+                    let s_sigmas = &local_constants_sigmas[common_data.sigmas_range()];
+                    let local_wires = wires_commitment.get_lde_values(i, step);
+                    let local_zs_partial_and_lookup =
+                        zs_partial_products_and_lookup_commitment.get_lde_values(i, step);
+                    let next_zs_partial_and_lookup =
+                        zs_partial_products_and_lookup_commitment.get_lde_values(i_next, step);
+
+                    let local_zs = &local_zs_partial_and_lookup[common_data.zs_range()];
+
+                    let next_zs = &next_zs_partial_and_lookup[common_data.zs_range()];
 
-            // NB (JN): I'm not sure how (in)efficient the below is. It needs measuring.
-            let mut local_constants_batch =
-                vec![F::ZERO; xs_batch.len() * local_constants_batch_refs[0].len()];
-            for i in 0..local_constants_batch_refs[0].len() {
-                for (j, constants) in local_constants_batch_refs.iter().enumerate() {
-                    local_constants_batch[i * xs_batch.len() + j] = constants[i];
+                    let partial_products =
+                        &local_zs_partial_and_lookup[common_data.partial_products_range()];
+
+                    if has_lookup {
+                        let local_lookup_zs =
+                            &local_zs_partial_and_lookup[common_data.lookup_range()];
+
+                        let next_lookup_zs =
+                            &next_zs_partial_and_lookup[common_data.lookup_range()];
+                        debug_assert_eq!(local_lookup_zs.len(), common_data.num_all_lookup_polys());
+
+                        local_lookup_batch.push(local_lookup_zs);
+                        next_lookup_batch.push(next_lookup_zs);
+                    }
+
+                    debug_assert_eq!(local_wires.len(), common_data.config.num_wires);
+                    debug_assert_eq!(local_zs.len(), num_challenges);
+
+                    local_constants_batch_refs.push(local_constants);
+                    local_wires_batch_refs.push(local_wires);
+
+                    shifted_xs_batch.push(shifted_x);
+                    local_zs_batch.push(local_zs);
+                    next_zs_batch.push(next_zs);
+                    partial_products_batch.push(partial_products);
+                    s_sigmas_batch.push(s_sigmas);
                 }
-            }
 
-            let mut local_wires_batch =
-                vec![F::ZERO; xs_batch.len() * local_wires_batch_refs[0].len()];
-            for i in 0..local_wires_batch_refs[0].len() {
-                for (j, wires) in local_wires_batch_refs.iter().enumerate() {
-                    local_wires_batch[i * xs_batch.len() + j] = wires[i];
+                // Optimized transposition with better cache locality
+                let n_constants = local_constants_batch_refs[0].len();
+                let mut local_constants_batch = vec![F::ZERO; xs_batch.len() * n_constants];
+                for i in 0..n_constants {
+                    let offset = i * xs_batch.len();
+                    for (j, constants) in local_constants_batch_refs.iter().enumerate() {
+                        local_constants_batch[offset + j] = constants[i];
+                    }
                 }
-            }
 
-            let vars_batch = EvaluationVarsBaseBatch::new(
-                xs_batch.len(),
-                &local_constants_batch,
-                &local_wires_batch,
-                public_inputs_hash,
-            );
+                let n_wires = local_wires_batch_refs[0].len();
+                let mut local_wires_batch = vec![F::ZERO; xs_batch.len() * n_wires];
+                for i in 0..n_wires {
+                    let offset = i * xs_batch.len();
+                    for (j, wires) in local_wires_batch_refs.iter().enumerate() {
+                        local_wires_batch[offset + j] = wires[i];
+                    }
+                }
 
-            let mut quotient_values_batch = eval_vanishing_poly_base_batch::<F, D>(
-                common_data,
-                &indices_batch,
-                &shifted_xs_batch,
-                vars_batch,
-                &local_zs_batch,
-                &next_zs_batch,
-                &local_lookup_batch,
-                &next_lookup_batch,
-                &partial_products_batch,
-                &s_sigmas_batch,
-                betas,
-                gammas,
-                deltas,
-                alphas,
-                &z_h_on_coset,
-                &lut_re_poly_evals_refs,
-            );
-
-            for (&i, quotient_values) in indices_batch.iter().zip(quotient_values_batch.iter_mut())
-            {
-                let denominator_inv = z_h_on_coset.eval_inverse(i);
-                quotient_values
-                    .iter_mut()
-                    .for_each(|v| *v *= denominator_inv);
-            }
-            quotient_values_batch
-        })
-        .collect();
+                let vars_batch = EvaluationVarsBaseBatch::new(
+                    xs_batch.len(),
+                    &local_constants_batch,
+                    &local_wires_batch,
+                    public_inputs_hash,
+                );
 
-    transpose(&quotient_values)
-        .into_par_iter()
-        .map(PolynomialValues::new)
-        .map(|values| values.coset_ifft(F::coset_shift()))
-        .collect()
+                let indices_batch: Vec<usize> = (batch_start..batch_start + batch_size).collect();
+                let mut quotient_values_batch = eval_vanishing_poly_base_batch::<F, D>(
+                    common_data,
+                    &indices_batch,
+                    &shifted_xs_batch,
+                    vars_batch,
+                    &local_zs_batch,
+                    &next_zs_batch,
+                    &local_lookup_batch,
+                    &next_lookup_batch,
+                    &partial_products_batch,
+                    &s_sigmas_batch,
+                    betas,
+                    gammas,
+                    deltas,
+                    alphas,
+                    &z_h_on_coset,
+                    &lut_re_poly_evals_refs,
+                );
+
+                for (j, quotient_values) in quotient_values_batch.iter_mut().enumerate() {
+                    let i = batch_start + j;
+                    let denominator_inv = z_h_on_coset.eval_inverse(i);
+                    quotient_values
+                        .iter_mut()
+                        .for_each(|v| *v *= denominator_inv);
+                }
+
+                quotient_values_batch
+            })
+            .collect()
+    });
+
+    timed!(
+        timing,
+        "transpose and final ifft",
+        transpose(&quotient_values)
+            .into_par_iter()
+            .map(PolynomialValues::new)
+            .map(|values| values.coset_ifft(F::coset_shift()))
+            .collect()
+    )
 }
diff --git a/plonky2/src/plonk/vanishing_poly.rs b/plonky2/src/plonk/vanishing_poly.rs
index 48179ce63..6a5b8a266 100644
--- a/plonky2/src/plonk/vanishing_poly.rs
+++ b/plonky2/src/plonk/vanishing_poly.rs
@@ -211,20 +211,29 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
     let num_challenges = common_data.config.num_challenges;
     let num_routed_wires = common_data.config.num_routed_wires;
 
-    let mut numerator_values = Vec::with_capacity(num_routed_wires);
-    let mut denominator_values = Vec::with_capacity(num_routed_wires);
+    // Pre-allocate reusable buffers with exact capacities
+    let mut numerator_values = Vec::with_capacity(num_routed_wires * num_challenges);
+    let mut denominator_values = Vec::with_capacity(num_routed_wires * num_challenges);
 
     // The L_0(x) (Z(x) - 1) vanishing terms.
     let mut vanishing_z_1_terms = Vec::with_capacity(num_challenges);
     // The terms checking the partial products.
-    let mut vanishing_partial_products_terms = Vec::new();
+    let mut vanishing_partial_products_terms = Vec::with_capacity(num_challenges * num_prods);
 
     // The terms checking the lookup constraints.
-    let mut vanishing_all_lookup_terms = if has_lookup {
+    let lookup_terms_capacity = if has_lookup {
         let num_sldc_polys = common_data.num_lookup_polys - 1;
-        Vec::with_capacity(
-            common_data.config.num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys),
-        )
+        num_challenges * (4 + common_data.luts.len() + 2 * num_sldc_polys)
+    } else {
+        0
+    };
+    let mut vanishing_all_lookup_terms = Vec::with_capacity(lookup_terms_capacity);
+
+    // Pre-allocate selector buffer if needed
+    let selector_offset = common_data.selectors_info.num_selectors();
+    let num_lookup_selectors = common_data.num_lookup_selectors;
+    let mut lookup_selectors = if has_lookup && num_lookup_selectors > 0 {
+        Vec::with_capacity(num_lookup_selectors)
     } else {
         Vec::new()
     };
@@ -235,22 +244,20 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
         let x = xs_batch[k];
         let vars = vars_batch.view(k);
 
-        let lookup_selectors: Vec<F> = (0..common_data.num_lookup_selectors)
-            .map(|i| vars.local_constants[common_data.selectors_info.num_selectors() + i])
-            .collect();
+        // Reuse lookup_selectors buffer
+        if has_lookup {
+            lookup_selectors.clear();
+            lookup_selectors.extend(
+                (0..num_lookup_selectors).map(|i| vars.local_constants[selector_offset + i]),
+            );
+        }
 
         let local_zs = local_zs_batch[k];
         let next_zs = next_zs_batch[k];
-        let local_lookup_zs = if has_lookup {
-            local_lookup_zs_batch[k]
-        } else {
-            &[]
-        };
-
-        let next_lookup_zs = if has_lookup {
-            next_lookup_zs_batch[k]
+        let (local_lookup_zs, next_lookup_zs) = if has_lookup {
+            (local_lookup_zs_batch[k], next_lookup_zs_batch[k])
         } else {
-            &[]
+            (&[][..], &[][..])
         };
 
         let partial_products = partial_products_batch[k];
@@ -259,6 +266,16 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
         let constraint_terms = PackedStridedView::new(&constraint_terms_batch, n, k);
 
         let l_0_x = z_h_on_coset.eval_l_0(index, x);
+
+        // Pre-compute common values for all challenges
+        let beta_x_s_ids: Vec<F> = if num_challenges > 0 {
+            (0..num_routed_wires)
+                .map(|j| common_data.k_is[j] * x)
+                .collect()
+        } else {
+            Vec::new()
+        };
+
         for i in 0..num_challenges {
             let z_x = local_zs[i];
             let z_gx = next_zs[i];
@@ -268,10 +285,10 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
             if has_lookup {
                 let cur_deltas = &deltas[NUM_COINS_LOOKUP * i..NUM_COINS_LOOKUP * (i + 1)];
 
-                let cur_local_lookup_zs = &local_lookup_zs
-                    [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)];
-                let cur_next_lookup_zs = &next_lookup_zs
-                    [common_data.num_lookup_polys * i..common_data.num_lookup_polys * (i + 1)];
+                let lookup_poly_start = common_data.num_lookup_polys * i;
+                let lookup_poly_end = lookup_poly_start + common_data.num_lookup_polys;
+                let cur_local_lookup_zs = &local_lookup_zs[lookup_poly_start..lookup_poly_end];
+                let cur_next_lookup_zs = &next_lookup_zs[lookup_poly_start..lookup_poly_end];
 
                 let lookup_constraints = check_lookup_constraints_batch(
                     common_data,
@@ -285,17 +302,17 @@ pub(crate) fn eval_vanishing_poly_base_batch<F: RichField + Extendable<D>, const
                 vanishing_all_lookup_terms.extend(lookup_constraints);
             }
 
-            numerator_values.extend((0..num_routed_wires).map(|j| {
-                let wire_value = vars.local_wires[j];
-                let k_i = common_data.k_is[j];
-                let s_id = k_i * x;
-                wire_value + betas[i] * s_id + gammas[i]
-            }));
-            denominator_values.extend((0..num_routed_wires).map(|j| {
+            let beta_i = betas[i];
+            let gamma_i = gammas[i];
+
+            // Compute numerators and denominators in a single pass
+            for j in 0..num_routed_wires {
                 let wire_value = vars.local_wires[j];
-                let s_sigma = s_sigmas[j];
-                wire_value + betas[i] * s_sigma + gammas[i]
-            }));
+                numerator_values
+                    .push(wire_value + gamma_i.multiply_accumulate(beta_i, beta_x_s_ids[j]));
+                denominator_values
+                    .push(wire_value + gamma_i.multiply_accumulate(beta_i, s_sigmas[j]));
+            }
 
             // The partial products considered for this iteration of `i`.
             let current_partial_products = &partial_products[i * num_prods..(i + 1) * num_prods];
@@ -587,7 +604,9 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
     // Check RE row transition constraint.
     let mut cur_sum = next_z_re;
     for elt in &current_lookup_combos {
-        cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt;
+        // cur_sum = cur_sum * deltas[LookupChallenges::ChallengeDelta as usize] + *elt;
+        cur_sum =
+            elt.multiply_accumulate(cur_sum, deltas[LookupChallenges::ChallengeDelta as usize]);
     }
     let unfiltered_re_line = z_re - cur_sum;
 
@@ -639,7 +658,10 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
         let lut_sum_prods_with_mul = (poly * lut_degree
             ..min((poly + 1) * lut_degree, num_lut_slots))
             .fold(F::ZERO, |acc, i| {
-                acc + vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)] * lut_prod_i(i)
+                acc.multiply_accumulate(
+                    vars.local_wires[LookupTableGate::wire_ith_multiplicity(i)],
+                    lut_prod_i(i),
+                )
             });
 
         // The previous element is the previous poly of the current row or the last poly of the next row.
@@ -656,7 +678,8 @@ pub fn check_lookup_constraints_batch<F: RichField + Extendable<D>, const D: usi
             .push(lookup_selectors[LookupSelectors::TransSre as usize] * unfiltered_sum_transition);
 
         // Check LDC row and col transitions. It's the same constraint, with a row transition happening for slot == 0.
-        let unfiltered_ldc_transition = lu_prod * (z_x_lookup_sldcs[poly] - prev) + lu_sum_prods;
+        let unfiltered_ldc_transition =
+            lu_sum_prods.multiply_accumulate(lu_prod, z_x_lookup_sldcs[poly] - prev);
         constraints
             .push(lookup_selectors[LookupSelectors::TransLdc as usize] * unfiltered_ldc_transition);
     }
diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs
index 8f9960034..73ec06f7a 100644
--- a/plonky2/src/util/mod.rs
+++ b/plonky2/src/util/mod.rs
@@ -3,7 +3,7 @@
 #[cfg(not(feature = "std"))]
 use alloc::vec::Vec;
 
-use plonky2_maybe_rayon::*;
+use plonky2_maybe_rayon::{MaybeIntoParIter, ParallelIterator};
 #[doc(inline)]
 pub use plonky2_util::*;
 
@@ -23,7 +23,12 @@ pub(crate) fn transpose_poly_values<F: Field>(polys: Vec<PolynomialValues<F>>) -
 }
 
 pub fn transpose<T: Send + Sync + Copy>(matrix: &[Vec<T>]) -> Vec<Vec<T>> {
+    if matrix.is_empty() {
+        return vec![];
+    }
+
     let len = matrix[0].len();
+
     (0..len)
         .into_par_iter()
         .map(|i| matrix.iter().map(|row| row[i]).collect())
diff --git a/plonky2/src/util/serialization/mod.rs b/plonky2/src/util/serialization/mod.rs
index 3755851ac..209b43d3c 100644
--- a/plonky2/src/util/serialization/mod.rs
+++ b/plonky2/src/util/serialization/mod.rs
@@ -321,21 +321,21 @@ pub trait Read {
         H: Hasher<F>,
     {
         let leaves_len = self.read_usize()?;
-        let mut leaves = Vec::with_capacity(leaves_len);
+        let leaf_len = self.read_usize()?;
+        let mut leaves_2d = Vec::with_capacity(leaves_len * leaf_len);
         for _ in 0..leaves_len {
-            let leaf_len = self.read_usize()?;
-            leaves.push(self.read_field_vec(leaf_len)?);
+            leaves_2d.push(self.read_field_vec(leaf_len)?);
         }
 
+        let leaves_1d = leaves_2d.into_iter().flatten().collect();
+
         let digests_len = self.read_usize()?;
         let digests = self.read_hash_vec::<F, H>(digests_len)?;
         let cap_height = self.read_usize()?;
         let cap = self.read_merkle_cap::<F, H>(cap_height)?;
-        Ok(MerkleTree {
-            leaves,
-            digests,
-            cap,
-        })
+        Ok(MerkleTree::new_from_fields(
+            leaves_1d, leaf_len, digests, cap,
+        ))
     }
 
     /// Reads a value of type [`OpeningSet`] from `self` with the given `common_data`.
@@ -1421,10 +1421,11 @@ pub trait Write {
         F: RichField,
         H: Hasher<F>,
     {
-        self.write_usize(tree.leaves.len())?;
-        for i in 0..tree.leaves.len() {
-            self.write_usize(tree.leaves[i].len())?;
-            self.write_field_vec(&tree.leaves[i])?;
+        let leaves_count = tree.get_leaves_count();
+        self.write_usize(leaves_count)?;
+        self.write_usize(tree.leaf_size)?;
+        for i in 0..leaves_count {
+            self.write_field_vec(&tree.get(i))?;
         }
         self.write_hash_vec::<F, H>(&tree.digests)?;
         self.write_usize(tree.cap.height())?;
diff --git a/test_gpu.sh b/test_gpu.sh
new file mode 100755
index 000000000..9979bfeaf
--- /dev/null
+++ b/test_gpu.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+# GPU Testing Script for Plonky2
+# This script validates CUDA setup, zeknox library, and runs GPU-accelerated tests
+
+set -e  # Exit on any error
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+echo "========================================="
+echo "Plonky2 GPU Testing Script"
+echo "========================================="
+echo ""
+
+# Step 1: Check NVIDIA driver and CUDA
+echo -e "${YELLOW}[1/7] Checking NVIDIA driver and CUDA...${NC}"
+if ! command -v nvidia-smi &> /dev/null; then
+    echo -e "${RED}ERROR: nvidia-smi not found. Please install NVIDIA drivers.${NC}"
+    exit 1
+fi
+
+echo "NVIDIA Driver Information:"
+nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv
+echo ""
+
+# Check for CUDA toolkit
+if command -v nvcc &> /dev/null; then
+    echo "CUDA Compiler Version:"
+    nvcc --version | grep "release"
+    echo ""
+elif [ -n "$CUDA_HOME" ]; then
+    echo "CUDA_HOME is set to: $CUDA_HOME"
+    echo ""
+else
+    echo -e "${YELLOW}WARNING: nvcc not found and CUDA_HOME not set. CUDA toolkit may not be installed.${NC}"
+    echo "Continuing anyway as runtime libraries may still be available..."
+    echo ""
+fi
+
+echo -e "${GREEN} NVIDIA driver check passed${NC}"
+echo ""
+
+# Step 2: Check zeknox library
+echo -e "${YELLOW}[2/7] Checking zeknox library...${NC}"
+ZEKNOX_PATH="../zeknox"
+if [ ! -d "$ZEKNOX_PATH" ]; then
+    echo -e "${RED}ERROR: zeknox library not found at $ZEKNOX_PATH${NC}"
+    echo "Expected location: $(cd .. && pwd)/zeknox"
+    exit 1
+fi
+
+if [ -d "$ZEKNOX_PATH/wrappers/rust" ]; then
+    echo "Found zeknox library at: $(cd $ZEKNOX_PATH && pwd)"
+    echo "Rust wrappers directory exists: $ZEKNOX_PATH/wrappers/rust"
+else
+    echo -e "${YELLOW}WARNING: zeknox/wrappers/rust not found, but zeknox directory exists${NC}"
+fi
+
+echo -e "${GREEN} zeknox library check passed${NC}"
+echo ""
+
+# Step 3: Run field tests
+echo -e "${YELLOW}[3/7] Running field tests with GPU acceleration...${NC}"
+echo "Command: cd field && cargo test --release --features=cuda -- --test-threads=1"
+echo ""
+
+cd field
+if cargo test --release --features=cuda -- --test-threads=1; then
+    echo ""
+    echo -e "${GREEN} Field tests passed${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Field tests failed${NC}"
+    cd ..
+    exit 1
+fi
+cd ..
+echo ""
+
+# Step 4: Run fibonacci example with CUDA for correctness
+echo -e "${YELLOW}[4/7] Running fibonacci example with CUDA features...${NC}"
+echo "Command: NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci"
+echo ""
+
+if NUM_OF_GPUS=1 cargo run --release --features=cuda_sanity_check --example fibonacci; then
+    echo ""
+    echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}"
+    exit 1
+fi
+echo ""
+
+# Step 5: Run fibonacci example with CUDA for speed
+echo -e "${YELLOW}[5/7] Running fibonacci example with CUDA features...${NC}"
+echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda"
+echo ""
+
+if NUM_OF_GPUS=1 cargo run --release --example fibonacci --features=cuda; then
+    echo ""
+    echo -e "${GREEN} Fibonacci example completed successfully with GPU${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Fibonacci example failed with GPU${NC}"
+    exit 1
+fi
+echo ""
+
+
+# Step 6: Run fibonacci example with CPU
+echo -e "${YELLOW}[6/7] Running fibonacci example with CUDA features...${NC}"
+echo "Command: NUM_OF_GPUS=1 cargo run --release --example fibonacci"
+echo ""
+
+if cargo run --release --example fibonacci; then
+    echo ""
+    echo -e "${GREEN} Fibonacci example completed successfully with CPU${NC}"
+else
+    echo ""
+    echo -e "${RED}ERROR: Fibonacci example failed with CPU${NC}"
+    exit 1
+fi
+echo ""
+
+# Step 7: Summary
+echo "========================================="
+echo -e "${GREEN}All GPU tests completed successfully!${NC}"
+echo "========================================="
+echo ""
+echo "Tests run:"
+echo "   NVIDIA driver and CUDA verification"
+echo "   zeknox library verification"
+echo "   Field tests (FFT, polynomials, interpolation, cosets)"
+echo "   Fibonacci proof generation with GPU acceleration"
+echo ""
+echo -e "${GREEN}GPU testing complete!${NC}"