diff --git a/Cargo.toml b/Cargo.toml
index 81dbbde49..eed6218b8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,8 @@ rand = { version = "0.8.4", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 static_assertions = { version = "1.1.0", default-features = false }
 unroll = { version = "0.1.5", default-features = false }
+zeknox_= { path = "../zeknox/wrappers/rust"}
+
 
 [profile.release]
 opt-level = 3
diff --git a/field/Cargo.toml b/field/Cargo.toml
index e13f49efd..49cb04494 100644
--- a/field/Cargo.toml
+++ b/field/Cargo.toml
@@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] }
 static_assertions = { workspace = true }
 unroll = { workspace = true }
 
+# cuda accelerator wrapper
+zeknox = { workspace = true }
+
 # Local dependencies
 plonky2_util = { version = "1.0.0", path = "../util", default-features = false }
 
@@ -29,3 +32,9 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"]
 
 [lints]
 workspace = true
+
+
+[features]
+# default = []
+default = [ "cuda" ]
+cuda = []
\ No newline at end of file
diff --git a/field/src/fft.rs b/field/src/fft.rs
index d078ca6c3..eeb86b62d 100644
--- a/field/src/fft.rs
+++ b/field/src/fft.rs
@@ -32,16 +32,230 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
     root_table
 }
 
+#[cfg(feature = "cuda")]
+fn fft_dispatch_gpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+    if F::CUDA_SUPPORT {
+        return ntt_batch(
+            0,
+            input.as_mut_ptr(),
+            input.len().trailing_zeros() as usize,
+            NTTConfig::default(),
+        );
+    } else {
+        return fft_dispatch_cpu(input, zero_factor, root_table);
+    }
+}
+
+/// Batch FFT computation for multiple polynomials on GPU
+#[cfg(feature = "cuda")]
+fn fft_batch_dispatch_gpu<F: Field>(
+    inputs: &mut [F],
+    poly_size: usize,
+    num_polys: usize,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+
+    if F::CUDA_SUPPORT {
+        let mut cfg = NTTConfig::default();
+        cfg.batches = num_polys as u32;
+
+        return ntt_batch(
+            0,
+            inputs.as_mut_ptr(),
+            poly_size.trailing_zeros() as usize,
+            cfg,
+        );
+    } else {
+        // Fallback to CPU: process each polynomial separately
+        for i in 0..num_polys {
+            let start = i * poly_size;
+            let end = start + poly_size;
+            fft_dispatch_cpu(&mut inputs[start..end], zero_factor, root_table);
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+pub(crate) fn coset_fft_gpu<F: Field>(
+    poly: PolynomialCoeffs<F>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> PolynomialValues<F> {
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+
+    if !F::CUDA_SUPPORT {
+        // Fallback to CPU if CUDA not supported for this field
+        let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+            .powers()
+            .zip(&poly.coeffs)
+            .map(|(r, &c)| r * c)
+            .collect::<Vec<_>>()
+            .into();
+        return fft_with_options(modified_poly, zero_factor, root_table);
+    }
+
+    let PolynomialCoeffs { coeffs: mut buffer } = poly;
+    let lg_n = buffer.len().trailing_zeros() as usize;
+
+    // // Initialize coset on GPU
+    // // For Goldilocks field, the coset generator is 7 (MULTIPLICATIVE_GROUP_GENERATOR)
+    // // TODO: Make this generic for other fields if needed
+    // let coset_gen_u64 = 7u64;
+    // init_coset_rs(0, lg_n, coset_gen_u64);
+
+    // Configure NTT for coset
+    let mut cfg = NTTConfig::default();
+    cfg.with_coset = true;
+    cfg.ntt_type = zeknox::types::NTTType::Coset;
+
+    // Perform coset NTT on GPU
+    ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg);
+
+    PolynomialValues::new(buffer)
+}
+
+/// Batch coset FFT computation for multiple polynomials on GPU
+#[cfg(feature = "cuda")]
+fn coset_fft_batch_gpu<F: Field>(
+    polys: Vec<PolynomialCoeffs<F>>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> Vec<PolynomialValues<F>> {
+    use zeknox::ntt_batch;
+    use zeknox::types::NTTConfig;
+
+    if polys.is_empty() {
+        return Vec::new();
+    }
+
+    let num_polys = polys.len();
+    let poly_size = polys[0].len();
+
+    // Verify all polynomials have the same size
+    assert!(
+        polys.iter().all(|p| p.len() == poly_size),
+        "All polynomials must have the same size for batch coset FFT"
+    );
+
+    if !F::CUDA_SUPPORT {
+        // Fallback to CPU if CUDA not supported for this field
+        return polys
+            .into_iter()
+            .map(|poly| {
+                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                    .powers()
+                    .zip(&poly.coeffs)
+                    .map(|(r, &c)| r * c)
+                    .collect::<Vec<_>>()
+                    .into();
+                fft_with_options(modified_poly, zero_factor, root_table)
+            })
+            .collect();
+    }
+
+    // Flatten all polynomials into a single contiguous buffer
+    let mut buffer: Vec<F> = Vec::with_capacity(num_polys * poly_size);
+    for poly in polys {
+        buffer.extend_from_slice(&poly.coeffs);
+    }
+
+    let lg_n = poly_size.trailing_zeros() as usize;
+
+    // Configure NTT for batch coset
+    let mut cfg = NTTConfig::default();
+    cfg.batches = num_polys as u32;
+    cfg.with_coset = true;
+    cfg.ntt_type = zeknox::types::NTTType::Coset;
+
+    // Perform batch coset NTT on GPU
+    ntt_batch(0, buffer.as_mut_ptr(), lg_n, cfg);
+
+    // Split the buffer back into separate polynomials
+    buffer
+        .chunks(poly_size)
+        .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+        .collect()
+}
+
+/// Compute coset FFT for multiple polynomials in batch.
+/// All polynomials must have the same size (power of 2).
+/// Returns a vector of PolynomialValues in the same order as input.
+pub fn coset_fft_batch<F: Field>(polys: Vec<PolynomialCoeffs<F>>) -> Vec<PolynomialValues<F>> {
+    coset_fft_batch_with_options(polys, None, None)
+}
+
+/// Compute coset FFT for multiple polynomials in batch with options.
+/// All polynomials must have the same size (power of 2).
+/// Returns a vector of PolynomialValues in the same order as input.
+pub fn coset_fft_batch_with_options<F: Field>(
+    polys: Vec<PolynomialCoeffs<F>>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> Vec<PolynomialValues<F>> {
+    #[cfg(feature = "cuda")]
+    return coset_fft_batch_gpu(polys, zero_factor, root_table);
+
+    #[cfg(not(feature = "cuda"))]
+    {
+        // CPU fallback: process each polynomial separately
+        polys
+            .into_iter()
+            .map(|poly| {
+                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                    .powers()
+                    .zip(&poly.coeffs)
+                    .map(|(r, &c)| r * c)
+                    .collect::<Vec<_>>()
+                    .into();
+                fft_with_options(modified_poly, zero_factor, root_table)
+            })
+            .collect()
+    }
+}
+
+fn fft_dispatch_cpu<F: Field>(
+    input: &mut [F],
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) {
+    if root_table.is_some() {
+        return fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap());
+    } else {
+        // let pre_computed = F::pre_compute_fft_root_table(input.len());
+        // if pre_computed.is_some() {
+        //     return fft_classic(input, zero_factor.unwrap_or(0), pre_computed.unwrap());
+        // } else {
+        //     let computed = fft_root_table::<F>(input.len());
+
+        //     return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
+        // }
+        let computed = fft_root_table::<F>(input.len());
+
+        return fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref());
+    };
+}
+
 #[inline]
 fn fft_dispatch<F: Field>(
     input: &mut [F],
     zero_factor: Option<usize>,
     root_table: Option<&FftRootTable<F>>,
 ) {
-    let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
-    let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();
+    #[cfg(feature = "cuda")]
+    return fft_dispatch_gpu(input, zero_factor, root_table);
 
-    fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
+    #[cfg(not(feature = "cuda"))]
+    return fft_dispatch_cpu(input, zero_factor, root_table);
 }
 
 #[inline]
@@ -60,6 +274,66 @@ pub fn fft_with_options<F: Field>(
     PolynomialValues::new(buffer)
 }
 
+/// Compute FFT for multiple polynomials in batch.
+/// All polynomials must have the same size (power of 2).
+/// Returns a vector of PolynomialValues in the same order as input.
+#[inline]
+pub fn fft_batch<F: Field>(polys: Vec<PolynomialCoeffs<F>>) -> Vec<PolynomialValues<F>> {
+    fft_batch_with_options(polys, None, None)
+}
+
+/// Compute FFT for multiple polynomials in batch with options.
+/// All polynomials must have the same size (power of 2).
+/// Returns a vector of PolynomialValues in the same order as input.
+pub fn fft_batch_with_options<F: Field>(
+    polys: Vec<PolynomialCoeffs<F>>,
+    zero_factor: Option<usize>,
+    root_table: Option<&FftRootTable<F>>,
+) -> Vec<PolynomialValues<F>> {
+    if polys.is_empty() {
+        return Vec::new();
+    }
+
+    let num_polys = polys.len();
+    let poly_size = polys[0].len();
+
+    // Verify all polynomials have the same size
+    assert!(
+        polys.iter().all(|p| p.len() == poly_size),
+        "All polynomials must have the same size for batch FFT"
+    );
+    assert!(
+        poly_size.is_power_of_two(),
+        "Polynomial size must be a power of 2"
+    );
+
+    // Flatten all polynomials into a single contiguous buffer
+    let mut buffer: Vec<F> = Vec::with_capacity(num_polys * poly_size);
+    for poly in polys {
+        buffer.extend_from_slice(&poly.coeffs);
+    }
+
+    // Dispatch to GPU or CPU batch processing
+    #[cfg(feature = "cuda")]
+    fft_batch_dispatch_gpu(&mut buffer, poly_size, num_polys, zero_factor, root_table);
+
+    #[cfg(not(feature = "cuda"))]
+    {
+        // CPU fallback: process each polynomial separately
+        for i in 0..num_polys {
+            let start = i * poly_size;
+            let end = start + poly_size;
+            fft_dispatch_cpu(&mut buffer[start..end], zero_factor, root_table);
+        }
+    }
+
+    // Split the buffer back into separate polynomials
+    buffer
+        .chunks(poly_size)
+        .map(|chunk| PolynomialValues::new(chunk.to_vec()))
+        .collect()
+}
+
 #[inline]
 pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
     ifft_with_options(poly, None, None)
@@ -206,8 +480,10 @@ mod tests {
     use alloc::vec::Vec;
 
     use plonky2_util::{log2_ceil, log2_strict};
+    #[cfg(feature = "cuda")]
+    use zeknox::init_twiddle_factors_rs;
 
-    use crate::fft::{fft, fft_with_options, ifft};
+    use crate::fft::{coset_fft_batch, fft, fft_batch, fft_with_options, ifft};
     use crate::goldilocks_field::GoldilocksField;
     use crate::polynomial::{PolynomialCoeffs, PolynomialValues};
     use crate::types::Field;
@@ -218,6 +494,13 @@ mod tests {
         let degree = 200usize;
         let degree_padded = degree.next_power_of_two();
 
+        #[cfg(feature = "cuda")]
+        let log_degree = {
+            zeknox::clear_cuda_errors_rs();
+            let log_degree = degree_padded.trailing_zeros() as usize;
+            init_twiddle_factors_rs(0, log_degree);
+            log_degree
+        };
         // Create a vector of coeffs; the first degree of them are
         // "random", the last degree_padded-degree of them are zero.
         let coeffs = (0..degree)
@@ -239,6 +522,8 @@ mod tests {
         }
 
         for r in 0..4 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log_degree + r);
             // expand coefficients by factor 2^r by filling with zeros
             let zero_tail = coefficients.lde(r);
             assert_eq!(
@@ -248,6 +533,248 @@ mod tests {
         }
     }
 
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_fft_gpu_vs_cpu_single() {
+        type F = GoldilocksField;
+
+        // Test various polynomial sizes
+        for log_size in [8, 10, 12, 14] {
+            let size = 1 << log_size;
+            zeknox::clear_cuda_errors_rs();
+            init_twiddle_factors_rs(0, log_size);
+
+            // Create a random polynomial
+            let coeffs: Vec<F> = (0..size)
+                .map(|i| F::from_canonical_usize(i * 7919 % 1000000))
+                .collect();
+
+            let poly = PolynomialCoeffs {
+                coeffs: coeffs.clone(),
+            };
+
+            // Compute FFT using GPU (via fft function which dispatches to GPU)
+            let gpu_result = fft(poly.clone());
+
+            // Compute FFT using CPU (force CPU path)
+            let mut cpu_buffer = coeffs.clone();
+            super::fft_dispatch_cpu(&mut cpu_buffer, None, None);
+            let cpu_result = PolynomialValues::new(cpu_buffer);
+
+            // Compare results
+            assert_eq!(
+                gpu_result.len(),
+                cpu_result.len(),
+                "GPU and CPU results have different lengths for size {}",
+                size
+            );
+
+            for i in 0..size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Mismatch at index {} for polynomial size {}",
+                    i, size
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_fft_batch_gpu_vs_cpu() {
+        type F = GoldilocksField;
+
+        let poly_size: usize = 1 << 10; // 1024 elements
+        let num_polys = 8;
+        let log_size = poly_size.trailing_zeros() as usize;
+
+        zeknox::clear_cuda_errors_rs();
+        init_twiddle_factors_rs(0, log_size);
+
+        // Create multiple random polynomials
+        let polys: Vec<PolynomialCoeffs<F>> = (0..num_polys)
+            .map(|batch_idx| {
+                let coeffs: Vec<F> = (0..poly_size)
+                    .map(|i| F::from_canonical_usize((i * 7919 + batch_idx * 12345) % 1000000))
+                    .collect();
+                PolynomialCoeffs { coeffs }
+            })
+            .collect();
+
+        // Compute batch FFT using GPU
+        let gpu_results = fft_batch(polys.clone());
+
+        // Compute FFT for each polynomial using CPU
+        let cpu_results: Vec<PolynomialValues<F>> = polys
+            .into_iter()
+            .map(|poly| {
+                let mut buffer = poly.coeffs.clone();
+                super::fft_dispatch_cpu(&mut buffer, None, None);
+                PolynomialValues::new(buffer)
+            })
+            .collect();
+
+        // Compare results
+        assert_eq!(gpu_results.len(), cpu_results.len());
+        for (batch_idx, (gpu_result, cpu_result)) in
+            gpu_results.iter().zip(cpu_results.iter()).enumerate()
+        {
+            assert_eq!(gpu_result.len(), cpu_result.len());
+            for i in 0..poly_size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Batch FFT mismatch at batch {} index {}",
+                    batch_idx, i
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_coset_fft_gpu_vs_cpu_single() {
+        use zeknox::init_coset_rs;
+
+        use crate::types::PrimeField64;
+        type F = GoldilocksField;
+
+        for log_size in [8, 10, 12] {
+            let size = 1 << log_size;
+            zeknox::clear_cuda_errors_rs();
+            init_twiddle_factors_rs(0, log_size);
+
+            // Initialize coset for GPU
+            let coset_gen_u64 = F::coset_shift().to_canonical_u64();
+            init_coset_rs(0, log_size, coset_gen_u64);
+
+            // Create a random polynomial
+            let coeffs: Vec<F> = (0..size)
+                .map(|i| F::from_canonical_usize(i * 8191 % 1000000))
+                .collect();
+
+            let poly = PolynomialCoeffs {
+                coeffs: coeffs.clone(),
+            };
+
+            // Compute coset FFT using GPU
+            let gpu_result = super::coset_fft_gpu(poly.clone(), None, None);
+
+            // Compute coset FFT using CPU (apply coset shift then FFT)
+            let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                .powers()
+                .zip(&coeffs)
+                .map(|(r, &c)| r * c)
+                .collect::<Vec<_>>()
+                .into();
+
+            let mut cpu_buffer = modified_poly.coeffs;
+            super::fft_dispatch_cpu(&mut cpu_buffer, None, None);
+            let cpu_result = PolynomialValues::new(cpu_buffer);
+
+            // Compare results
+            assert_eq!(
+                gpu_result.len(),
+                cpu_result.len(),
+                "GPU and CPU coset FFT results have different lengths for size {}",
+                size
+            );
+
+            for i in 0..size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Coset FFT mismatch at index {} for polynomial size {}",
+                    i, size
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "cuda")]
+    fn test_coset_fft_batch_gpu_vs_cpu() {
+        use zeknox::init_coset_rs;
+
+        use crate::types::PrimeField64;
+        type F = GoldilocksField;
+
+        let poly_size: usize = 1 << 10; // 1024 elements
+        let num_polys = 8;
+        let log_size = poly_size.trailing_zeros() as usize;
+
+        zeknox::clear_cuda_errors_rs();
+        init_twiddle_factors_rs(0, log_size);
+
+        // Initialize coset for GPU
+        let coset_gen_u64 = F::coset_shift().to_canonical_u64();
+        init_coset_rs(0, log_size, coset_gen_u64);
+
+        // Create multiple random polynomials
+        let polys: Vec<PolynomialCoeffs<F>> = (0..num_polys)
+            .map(|batch_idx| {
+                let coeffs: Vec<F> = (0..poly_size)
+                    .map(|i| F::from_canonical_usize((i * 8191 + batch_idx * 54321) % 1000000))
+                    .collect();
+                PolynomialCoeffs { coeffs }
+            })
+            .collect();
+
+        // Compute batch coset FFT using GPU
+        let gpu_results = coset_fft_batch(polys.clone());
+
+        // Compute coset FFT for each polynomial using CPU
+        let cpu_results: Vec<PolynomialValues<F>> = polys
+            .into_iter()
+            .map(|poly| {
+                let modified_poly: PolynomialCoeffs<F> = F::coset_shift()
+                    .powers()
+                    .zip(&poly.coeffs)
+                    .map(|(r, &c)| r * c)
+                    .collect::<Vec<_>>()
+                    .into();
+
+                let mut buffer = modified_poly.coeffs;
+                super::fft_dispatch_cpu(&mut buffer, None, None);
+                PolynomialValues::new(buffer)
+            })
+            .collect();
+
+        // Compare results
+        assert_eq!(gpu_results.len(), cpu_results.len());
+        for (batch_idx, (gpu_result, cpu_result)) in
+            gpu_results.iter().zip(cpu_results.iter()).enumerate()
+        {
+            assert_eq!(gpu_result.len(), cpu_result.len());
+            for i in 0..poly_size {
+                assert_eq!(
+                    gpu_result.values[i], cpu_result.values[i],
+                    "Batch coset FFT mismatch at batch {} index {}",
+                    batch_idx, i
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_batch_fft_empty() {
+        type F = GoldilocksField;
+        let polys: Vec<PolynomialCoeffs<F>> = vec![];
+        let results = fft_batch(polys);
+        assert!(results.is_empty());
+    }
+
+    #[test]
+    #[should_panic(expected = "All polynomials must have the same size")]
+    fn test_batch_fft_different_sizes() {
+        type F = GoldilocksField;
+        let poly1 = PolynomialCoeffs {
+            coeffs: vec![F::ONE; 256],
+        };
+        let poly2 = PolynomialCoeffs {
+            coeffs: vec![F::ONE; 512],
+        };
+        let _ = fft_batch(vec![poly1, poly2]);
+    }
+
     fn evaluate_naive<F: Field>(coefficients: &PolynomialCoeffs<F>) -> PolynomialValues<F> {
         let degree = coefficients.len();
         let degree_padded = 1 << log2_ceil(degree);
diff --git a/field/src/goldilocks_extensions.rs b/field/src/goldilocks_extensions.rs
index 8f2d85253..6dd15ce0d 100644
--- a/field/src/goldilocks_extensions.rs
+++ b/field/src/goldilocks_extensions.rs
@@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/2)
     const DTH_ROOT: Self = Self(18446744069414584320);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
+        [Self(18081566051660590251), Self(16121475356294670766)];
 
-    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)];
+    const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
 }
 
 impl Mul for QuadraticExtension<GoldilocksField> {
@@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField {
     // DTH_ROOT = W^((ORDER - 1)/4)
     const DTH_ROOT: Self = Self(281474976710656);
 
-    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] =
-        [Self(0), Self(8295451483910296135), Self(0), Self(0)];
+    const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
+        Self(5024755240244648895),
+        Self(13227474371289740625),
+        Self(3912887029498544536),
+        Self(3900057112666848848),
+    ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
-        [Self(0), Self(0), Self(0), Self(17216955519093520442)];
+        [Self(0), Self(0), Self(0), Self(12587610116473453104)];
 }
 
 impl Mul for QuarticExtension<GoldilocksField> {
@@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField {
     const DTH_ROOT: Self = Self(1041288259238279555);
 
     const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
-        Self(4624713872807171977),
-        Self(381988216716071028),
-        Self(14499722700050429911),
-        Self(4870631734967222356),
-        Self(4518902370426242880),
+        Self(2899034827742553394),
+        Self(13012057356839176729),
+        Self(14593811582388663055),
+        Self(7722900811313895436),
+        Self(4557222484695340057),
     ];
 
     const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
diff --git a/field/src/goldilocks_field.rs b/field/src/goldilocks_field.rs
index b0191ca59..ae8457744 100644
--- a/field/src/goldilocks_field.rs
+++ b/field/src/goldilocks_field.rs
@@ -77,17 +77,19 @@ impl Field for GoldilocksField {
     const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY;
 
     // Sage: `g = GF(p).multiplicative_generator()`
-    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720);
+    const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7);
 
     // Sage:
     // ```
     // g_2 = g^((p - 1) / 2^32)
     // g_2.multiplicative_order().factor()
     // ```
-    const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926);
+    const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772);
 
     const BITS: usize = 64;
 
+    const CUDA_SUPPORT: bool = true;
+
     fn order() -> BigUint {
         Self::ORDER.into()
     }
diff --git a/field/src/interpolation.rs b/field/src/interpolation.rs
index df7084572..9772fff56 100644
--- a/field/src/interpolation.rs
+++ b/field/src/interpolation.rs
@@ -77,6 +77,9 @@ pub fn interpolate2<F: Field>(points: [(F, F); 2], x: F) -> F {
 
 #[cfg(test)]
 mod tests {
+    #[cfg(feature = "cuda")]
+    use zeknox::init_twiddle_factors_rs;
+
     use super::*;
     use crate::extension::quartic::QuarticExtension;
     use crate::goldilocks_field::GoldilocksField;
@@ -87,7 +90,12 @@ mod tests {
     fn interpolant_random() {
         type F = GoldilocksField;
 
-        for deg in 0..10 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg in 2..10 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(deg));
             let domain = F::rand_vec(deg);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -101,7 +109,13 @@ mod tests {
     fn interpolant_random_roots_of_unity() {
         type F = GoldilocksField;
 
-        for deg_log in 0..4 {
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
+        for deg_log in 1..4 {
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, deg_log);
+
             let deg = 1 << deg_log;
             let domain = F::two_adic_subgroup(deg_log);
             let coeffs = F::rand_vec(deg);
@@ -116,8 +130,15 @@ mod tests {
     fn interpolant_random_overspecified() {
         type F = GoldilocksField;
 
+        #[cfg(feature = "cuda")]
+        zeknox::clear_cuda_errors_rs();
+
         for deg in 0..10 {
             let points = deg + 5;
+
+            #[cfg(feature = "cuda")]
+            init_twiddle_factors_rs(0, log2_ceil(points));
+
             let domain = F::rand_vec(points);
             let coeffs = F::rand_vec(deg);
             let coeffs = PolynomialCoeffs { coeffs };
@@ -137,6 +158,8 @@ mod tests {
         let points = [(F::rand(), F::rand()), (F::rand(), F::rand())];
         let x = F::rand();
 
+        #[cfg(feature = "cuda")]
+        init_twiddle_factors_rs(0, 2);
         let ev0 = interpolant(&points).eval(x);
         let ev1 = interpolate(&points, x, &barycentric_weights(&points));
         let ev2 = interpolate2(points, x);
diff --git a/field/src/lib.rs b/field/src/lib.rs
index c713db885..9a2ea4f9c 100644
--- a/field/src/lib.rs
+++ b/field/src/lib.rs
@@ -4,7 +4,6 @@
 #![deny(rustdoc::broken_intra_doc_links)]
 #![deny(missing_debug_implementations)]
 #![feature(specialization)]
-#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512))]
 #![cfg_attr(not(test), no_std)]
 
 extern crate alloc;
diff --git a/field/src/polynomial/mod.rs b/field/src/polynomial/mod.rs
index c13bbca27..be8bf0ad9 100644
--- a/field/src/polynomial/mod.rs
+++ b/field/src/polynomial/mod.rs
@@ -283,6 +283,15 @@ impl<F: Field> PolynomialCoeffs<F> {
         zero_factor: Option<usize>,
         root_table: Option<&FftRootTable<F>>,
     ) -> PolynomialValues<F> {
+        #[cfg(feature = "cuda")]
+        {
+            if F::CUDA_SUPPORT && shift == F::coset_shift() {
+                // Use GPU coset FFT directly without CPU-side coefficient modification
+                return crate::fft::coset_fft_gpu(self.clone(), zero_factor, root_table);
+            }
+        }
+
+        // CPU path: multiply by powers of shift, then do regular FFT
         let modified_poly: Self = shift
             .powers()
             .zip(&self.coeffs)
@@ -440,6 +449,8 @@ impl<F: Field> Mul for &PolynomialCoeffs<F> {
 mod tests {
     use std::time::Instant;
 
+    #[cfg(feature = "cuda")]
+    use plonky2_util::log2_ceil;
     use rand::rngs::OsRng;
     use rand::Rng;
 
@@ -479,6 +490,13 @@ mod tests {
 
         let k = 8;
         let n = 1 << k;
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            zeknox::init_twiddle_factors_rs(0, k);
+        }
+
         let poly = PolynomialCoeffs::new(F::rand_vec(n));
         let shift = F::rand();
         let coset_evals = poly.coset_fft(shift).values;
@@ -500,6 +518,13 @@ mod tests {
 
         let k = 8;
         let n = 1 << k;
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            zeknox::init_twiddle_factors_rs(0, k);
+        }
+
         let evals = PolynomialValues::new(F::rand_vec(n));
         let shift = F::rand();
         let coeffs = evals.clone().coset_ifft(shift);
@@ -520,6 +545,12 @@ mod tests {
         type F = GoldilocksField;
         let mut rng = OsRng;
         let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000));
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            zeknox::init_twiddle_factors_rs(0, log2_ceil(a_deg + b_deg + 1));
+        }
         let a = PolynomialCoeffs::new(F::rand_vec(a_deg));
         let b = PolynomialCoeffs::new(F::rand_vec(b_deg));
         let m1 = &a * &b;
@@ -537,11 +568,24 @@ mod tests {
         let mut rng = OsRng;
         let a_deg = rng.gen_range(0..1_000);
         let n = rng.gen_range(1..1_000);
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            for i in 1..=log2_ceil(max(a_deg, n)) + 1 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let mut a = PolynomialCoeffs::new(F::rand_vec(a_deg + 1));
+        println!("a {} b {}", a.len(), n);
+
         if a.coeffs[0].is_zero() {
             a.coeffs[0] = F::ONE; // First coefficient needs to be nonzero.
         }
         let b = a.inv_mod_xn(n);
+        println!("a {} b {}", a.len(), b.len());
+
         let mut m = &a * &b;
         m.coeffs.truncate(n);
         m.trim();
@@ -575,6 +619,15 @@ mod tests {
         type F = GoldilocksField;
         let mut rng = OsRng;
         let (a_deg, b_deg) = (rng.gen_range(1..10_000), rng.gen_range(1..10_000));
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            for i in 1..=log2_ceil(max(a_deg, b_deg)) + 1 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let a = PolynomialCoeffs::new(F::rand_vec(a_deg));
         let b = PolynomialCoeffs::new(F::rand_vec(b_deg));
         let (q, r) = a.div_rem(&b);
@@ -606,6 +659,7 @@ mod tests {
         let mut rng = OsRng;
         let l = 14;
         let n = 1 << l;
+
         let g = F::primitive_root_of_unity(l);
         let xn_minus_one = {
             let mut xn_min_one_vec = vec![F::ZERO; n + 1];
@@ -616,6 +670,15 @@ mod tests {
 
         let a = g.exp_u64(rng.gen_range(0..(n as u64)));
         let denom = PolynomialCoeffs::new(vec![-a, F::ONE]);
+
+        #[cfg(feature = "cuda")]
+        {
+            zeknox::clear_cuda_errors_rs();
+            for i in 1..=l + 1 {
+                zeknox::init_twiddle_factors_rs(0, i);
+            }
+        }
+
         let now = Instant::now();
         xn_minus_one.div_rem(&denom);
         println!("Division time: {:?}", now.elapsed());
diff --git a/field/src/types.rs b/field/src/types.rs
index d714b7a84..5a34bb6a3 100644
--- a/field/src/types.rs
+++ b/field/src/types.rs
@@ -91,6 +91,9 @@ pub trait Field:
     /// The bit length of the field order.
     const BITS: usize;
 
+    /// Whether this field is supported by cuda
+    const CUDA_SUPPORT: bool = false;
+
     fn order() -> BigUint;
     fn characteristic() -> BigUint;