Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0c6091e
fix parameters
zhenfeizhang Nov 14, 2025
7065a82
fix fft
zhenfeizhang Nov 15, 2025
fb3c96f
fix FFT/cosetFFT GPUs
zhenfeizhang Nov 18, 2025
c3aae3d
fix merkle tree
lighter-zz Nov 20, 2025
ea7334c
wip
zhenfeizhang Nov 24, 2025
912f1a2
wip
zhenfeizhang Nov 24, 2025
a04950f
wip
zhenfeizhang Nov 24, 2025
31fc974
Merge remote-tracking branch 'zz-lighter/zz/cuda_integration' into zz…
lighter-zz Nov 24, 2025
5f625e4
clean up
lighter-zz Nov 24, 2025
d48ec0d
Update perm_comp.md
lighter-zz Nov 24, 2025
6139c76
Update perm_comp.md
lighter-zz Nov 24, 2025
2669e9b
fix
lighter-zz Dec 9, 2025
6db10e9
fix again
lighter-zz Dec 9, 2025
022397a
fixes
lighter-zz Dec 9, 2025
914139f
fix
lighter-zz Dec 9, 2025
80d4292
merkle tree good version
Dec 9, 2025
af979c0
working
Dec 11, 2025
6ba6963
wip clean up
Dec 11, 2025
7b659d7
continue clean up
Dec 11, 2025
83f3d96
merkle tree sanity checks
Dec 11, 2025
2c9dd3f
fix more tests
Dec 11, 2025
fee54ed
clean up
Dec 11, 2025
4b82d19
clean up printing
Dec 11, 2025
8761671
clean up and scripts for testing
Dec 11, 2025
d58854c
more clean up
Dec 11, 2025
50a71fb
more clean up
Dec 11, 2025
9246859
fix bugs
Dec 12, 2025
39ca680
working now
Dec 15, 2025
d937578
clean up
Dec 15, 2025
2486825
finished
Dec 15, 2025
e0c397c
clean up
Dec 15, 2025
0606854
fix bug
Dec 16, 2025
ca5a5bb
optimized cpu memory alloce
Dec 16, 2025
6785308
futher opts
Dec 16, 2025
fa060c4
clean up
Jan 10, 2026
dcbfe7e
Merge branch 'dev' into zz/fix_merkle
Jan 10, 2026
2055bbc
fix example
Jan 10, 2026
bd17b4d
update dependency
Jan 10, 2026
041e1b7
update
Jan 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ pgo-data.profdata

# MacOS nuisances
.DS_Store

*.log

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ hashbrown = { version = "0.14.3", default-features = false, features = ["ahash",
itertools = { version = "0.11.0", default-features = false }
log = { version = "0.4.14", default-features = false }
num = { version = "0.4", default-features = false, features = ["rand"] }
once_cell = { version = "1.18.0", default-features = false }
rand = { version = "0.8.4", default-features = false }
serde = { version = "1.0", default-features = false, features = ["derive"] }
static_assertions = { version = "1.1.0", default-features = false }
unroll = { version = "0.1.5", default-features = false }
zeknox = { path = "../zeknox/wrappers/rust" }

[profile.release]
opt-level = 3
Expand Down
12 changes: 12 additions & 0 deletions field/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ serde = { workspace = true, features = ["alloc"] }
static_assertions = { workspace = true }
unroll = { workspace = true }

# cuda accelerator wrapper
zeknox = { workspace = true }

# Local dependencies
plonky2_util = { version = "1.0.0", path = "../util", default-features = false }

Expand All @@ -29,3 +32,12 @@ rustdoc-args = ["--html-in-header", ".cargo/katex-header.html"]

[lints]
workspace = true


[features]
# default = []
default = [ "cuda" ]
# default = [ "cuda", "cuda_sanity_check" ]
cuda = []
# sanity check: when this flag is on, the computation will done on both CPU and CUDA, and the results compared
cuda_sanity_check = ["cuda"]
130 changes: 125 additions & 5 deletions field/src/fft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,36 @@ use crate::types::Field;

pub type FftRootTable<F> = Vec<Vec<F>>;

pub fn batch_fft<F: Field>(input: &[PolynomialCoeffs<F>]) -> Vec<PolynomialValues<F>> {
#[cfg(feature = "cuda")]
{
use zeknox::ntt_batch;
use zeknox::types::NTTConfig;

let mut data = input
.iter()
.flat_map(|poly| poly.coeffs.clone())
.collect::<Vec<F>>();
let mut cfg = NTTConfig::default();
cfg.batches = input.len() as u32;
let poly_len = input[0].len();
ntt_batch(0, &mut data, log2_strict(poly_len), cfg);

data.chunks(poly_len)
.map(|chunk| PolynomialValues::new(chunk.to_vec()))
.collect()
}
#[cfg(not(feature = "cuda"))]
{
let mut res = Vec::with_capacity(input.len());
for poly in input.iter() {
let pv = fft_with_options(poly.clone(), None, None);
res.push(pv);
}
res
}
}

pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
let lg_n = log2_strict(n);
// bases[i] = g^2^i, for i = 0, ..., lg_n - 1
Expand All @@ -32,16 +62,72 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
root_table
}

#[cfg(feature = "cuda")]
fn fft_dispatch_gpu<F: Field>(
input: &mut [F],
zero_factor: Option<usize>,
root_table: Option<&FftRootTable<F>>,
) {
if F::CUDA_SUPPORT {
use zeknox::ntt_batch;
use zeknox::types::NTTConfig;

#[cfg(feature = "cuda_sanity_check")]
let cpu_res = {
let mut input_clone = input.to_vec();
fft_dispatch_cpu(&mut input_clone, zero_factor, root_table);
input_clone
};

ntt_batch(
0,
input,
input.len().trailing_zeros() as usize,
NTTConfig::default(),
);

#[cfg(feature = "cuda_sanity_check")]
for (i, (a, b)) in input.iter().zip(cpu_res.iter()).enumerate() {
if a != b {
panic!(
"Mismatch at index {}: gpu result = {}, cpu result = {}",
i, a, b
);
}
}
return;
} else {
return fft_dispatch_cpu(input, zero_factor, root_table);
}
}

fn fft_dispatch_cpu<F: Field>(
input: &mut [F],
zero_factor: Option<usize>,
root_table: Option<&FftRootTable<F>>,
) {
if root_table.is_some() {
fft_classic(input, zero_factor.unwrap_or(0), root_table.unwrap())
} else {
let computed = fft_root_table::<F>(input.len());
fft_classic(input, zero_factor.unwrap_or(0), computed.as_ref())
};
}

#[inline]
fn fft_dispatch<F: Field>(
input: &mut [F],
zero_factor: Option<usize>,
root_table: Option<&FftRootTable<F>>,
) {
let computed_root_table = root_table.is_none().then(|| fft_root_table(input.len()));
let used_root_table = root_table.or(computed_root_table.as_ref()).unwrap();

fft_classic(input, zero_factor.unwrap_or(0), used_root_table);
#[cfg(feature = "cuda")]
{
fft_dispatch_gpu(input, zero_factor, root_table)
}
#[cfg(not(feature = "cuda"))]
{
fft_dispatch_cpu(input, zero_factor, root_table)
}
}

#[inline]
Expand All @@ -50,6 +136,7 @@ pub fn fft<F: Field>(poly: PolynomialCoeffs<F>) -> PolynomialValues<F> {
}

#[inline]

pub fn fft_with_options<F: Field>(
poly: PolynomialCoeffs<F>,
zero_factor: Option<usize>,
Expand All @@ -65,6 +152,28 @@ pub fn ifft<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
ifft_with_options(poly, None, None)
}

#[inline]
pub fn ifft_cpu<F: Field>(poly: PolynomialValues<F>) -> PolynomialCoeffs<F> {
let n = poly.len();
let lg_n = log2_strict(n);
let n_inv = F::inverse_2exp(lg_n);

let PolynomialValues { values: mut buffer } = poly;
fft_dispatch_cpu(&mut buffer, None, None);

// We reverse all values except the first, and divide each by n.
buffer[0] *= n_inv;
buffer[n / 2] *= n_inv;
for i in 1..(n / 2) {
let j = n - i;
let coeffs_i = buffer[j] * n_inv;
let coeffs_j = buffer[i] * n_inv;
buffer[i] = coeffs_i;
buffer[j] = coeffs_j;
}
PolynomialCoeffs { coeffs: buffer }
}

pub fn ifft_with_options<F: Field>(
poly: PolynomialValues<F>,
zero_factor: Option<usize>,
Expand Down Expand Up @@ -214,6 +323,17 @@ mod tests {

#[test]
fn fft_and_ifft() {
#[cfg(feature = "cuda")]
{
zeknox::clear_cuda_errors_rs();
// Initialize twiddle factors for sizes we'll use
// degree_padded is 256 = 2^8
// lde then add 4 more bits
for i in 8..=12 {
zeknox::init_twiddle_factors_rs(0, i);
}
}

type F = GoldilocksField;
let degree = 200usize;
let degree_padded = degree.next_power_of_two();
Expand All @@ -222,7 +342,7 @@ mod tests {
// "random", the last degree_padded-degree of them are zero.
let coeffs = (0..degree)
.map(|i| F::from_canonical_usize(i * 1337 % 100))
.chain(core::iter::repeat_n(F::ZERO, degree_padded - degree))
.chain(core::iter::repeat(F::ZERO).take(degree_padded - degree))
.collect::<Vec<_>>();
assert_eq!(coeffs.len(), degree_padded);
let coefficients = PolynomialCoeffs { coeffs };
Expand Down
25 changes: 15 additions & 10 deletions field/src/goldilocks_extensions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ impl Extendable<2> for GoldilocksField {
// DTH_ROOT = W^((ORDER - 1)/2)
const DTH_ROOT: Self = Self(18446744069414584320);

const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] = [Self(0), Self(11713931119993638672)];
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 2] =
[Self(18081566051660590251), Self(16121475356294670766)];

const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(7226896044987257365)];
const EXT_POWER_OF_TWO_GENERATOR: [Self; 2] = [Self(0), Self(15659105665374529263)];
}

impl Mul for QuadraticExtension<GoldilocksField> {
Expand All @@ -44,11 +45,15 @@ impl Extendable<4> for GoldilocksField {
// DTH_ROOT = W^((ORDER - 1)/4)
const DTH_ROOT: Self = Self(281474976710656);

const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] =
[Self(0), Self(8295451483910296135), Self(0), Self(0)];
const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 4] = [
Self(5024755240244648895),
Self(13227474371289740625),
Self(3912887029498544536),
Self(3900057112666848848),
];

const EXT_POWER_OF_TWO_GENERATOR: [Self; 4] =
[Self(0), Self(0), Self(0), Self(17216955519093520442)];
[Self(0), Self(0), Self(0), Self(12587610116473453104)];
}

impl Mul for QuarticExtension<GoldilocksField> {
Expand All @@ -70,11 +75,11 @@ impl Extendable<5> for GoldilocksField {
const DTH_ROOT: Self = Self(1041288259238279555);

const EXT_MULTIPLICATIVE_GROUP_GENERATOR: [Self; 5] = [
Self(4624713872807171977),
Self(381988216716071028),
Self(14499722700050429911),
Self(4870631734967222356),
Self(4518902370426242880),
Self(2899034827742553394),
Self(13012057356839176729),
Self(14593811582388663055),
Self(7722900811313895436),
Self(4557222484695340057),
];

const EXT_POWER_OF_TWO_GENERATOR: [Self; 5] = [
Expand Down
6 changes: 4 additions & 2 deletions field/src/goldilocks_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,19 @@ impl Field for GoldilocksField {
const CHARACTERISTIC_TWO_ADICITY: usize = Self::TWO_ADICITY;

// Sage: `g = GF(p).multiplicative_generator()`
const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(14293326489335486720);
const MULTIPLICATIVE_GROUP_GENERATOR: Self = Self(7);

// Sage:
// ```
// g_2 = g^((p - 1) / 2^32)
// g_2.multiplicative_order().factor()
// ```
const POWER_OF_TWO_GENERATOR: Self = Self(7277203076849721926);
const POWER_OF_TWO_GENERATOR: Self = Self(1753635133440165772);

const BITS: usize = 64;

const CUDA_SUPPORT: bool = true;

fn order() -> BigUint {
Self::ORDER.into()
}
Expand Down
27 changes: 25 additions & 2 deletions field/src/interpolation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ pub fn interpolate2<F: Field>(points: [(F, F); 2], x: F) -> F {

#[cfg(test)]
mod tests {
#[cfg(feature = "cuda")]
use zeknox::init_twiddle_factors_rs;

use super::*;
use crate::extension::quartic::QuarticExtension;
use crate::goldilocks_field::GoldilocksField;
Expand All @@ -87,7 +90,12 @@ mod tests {
fn interpolant_random() {
type F = GoldilocksField;

for deg in 0..10 {
#[cfg(feature = "cuda")]
zeknox::clear_cuda_errors_rs();

for deg in 2..10 {
#[cfg(feature = "cuda")]
init_twiddle_factors_rs(0, log2_ceil(deg));
let domain = F::rand_vec(deg);
let coeffs = F::rand_vec(deg);
let coeffs = PolynomialCoeffs { coeffs };
Expand All @@ -101,7 +109,13 @@ mod tests {
fn interpolant_random_roots_of_unity() {
type F = GoldilocksField;

for deg_log in 0..4 {
#[cfg(feature = "cuda")]
zeknox::clear_cuda_errors_rs();

for deg_log in 1..4 {
#[cfg(feature = "cuda")]
init_twiddle_factors_rs(0, deg_log);

let deg = 1 << deg_log;
let domain = F::two_adic_subgroup(deg_log);
let coeffs = F::rand_vec(deg);
Expand All @@ -116,8 +130,15 @@ mod tests {
fn interpolant_random_overspecified() {
type F = GoldilocksField;

#[cfg(feature = "cuda")]
zeknox::clear_cuda_errors_rs();

for deg in 0..10 {
let points = deg + 5;

#[cfg(feature = "cuda")]
init_twiddle_factors_rs(0, log2_ceil(points));

let domain = F::rand_vec(points);
let coeffs = F::rand_vec(deg);
let coeffs = PolynomialCoeffs { coeffs };
Expand All @@ -137,6 +158,8 @@ mod tests {
let points = [(F::rand(), F::rand()), (F::rand(), F::rand())];
let x = F::rand();

#[cfg(feature = "cuda")]
init_twiddle_factors_rs(0, 2);
let ev0 = interpolant(&points).eval(x);
let ev1 = interpolate(&points, x, &barycentric_weights(&points));
let ev2 = interpolate2(points, x);
Expand Down
1 change: 1 addition & 0 deletions field/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pub mod polynomial;
pub mod secp256k1_base;
pub mod secp256k1_scalar;
pub mod types;
pub mod util;
pub mod zero_poly_coset;

#[cfg(test)]
Expand Down
2 changes: 1 addition & 1 deletion field/src/polynomial/division.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ impl<F: Field> PolynomialCoeffs<F> {
.iter()
.rev()
.scan(F::ZERO, |acc, &c| {
*acc = *acc * z + c;
*acc = c.multiply_accumulate(*acc, z);
Some(*acc)
})
.collect::<Vec<_>>();
Expand Down
Loading
Loading