diff --git a/Makefile b/Makefile index e4e54f2d5..f29ec030a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: deps deps-linux deps-macos prepare-test-data compile-programs-asm compile-programs-rust compile-bench \ compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm test-no-compile \ test-asm-no-compile test-rust test-rust-no-compile test-executor flamegraph-prover \ -test-fast test-prover test-prover-all test-disk-spill test-math-cuda bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint +test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint UNAME := $(shell uname) @@ -194,6 +194,12 @@ test-disk-spill: test-math-cuda: cargo test -p math-cuda --release +# End-to-end cuda dispatch coverage (requires NVIDIA GPU + nvcc). +# Asserts every R1/R2/R3 GPU counter fired on a real prove. +test-cuda-integration: + cargo test -p lambda-vm-prover --release --features cuda \ + --test cuda_path_integration -- --ignored --nocapture + # math-cuda quick microbench (median of 10 runs) bench-math-cuda: cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs index bc84f2653..25c16a634 100644 --- a/crypto/math-cuda/build.rs +++ b/crypto/math-cuda/build.rs @@ -111,4 +111,5 @@ fn main() { compile_ptx("arith.cu", "arith.ptx", have_nvcc); compile_ptx("ntt.cu", "ntt.ptx", have_nvcc); compile_ptx("keccak.cu", "keccak.ptx", have_nvcc); + compile_ptx("barycentric.cu", "barycentric.ptx", have_nvcc); } diff --git a/crypto/math-cuda/kernels/barycentric.cu b/crypto/math-cuda/kernels/barycentric.cu new file mode 100644 index 000000000..5c18bcb88 --- /dev/null +++ b/crypto/math-cuda/kernels/barycentric.cu @@ -0,0 +1,192 @@ +// Barycentric evaluation of a polynomial (given as evaluations on a coset) at +// a single out-of-domain point. Matches the CPU +// `math::polynomial::interpolate_coset_eval_*_with_g_n_inv` pair. +// +// Per column, the barycentric sum is +// S = sum over i of point_i * eval_i * inv_denom_i +// where `point_i` is a base-field coset point, `eval_i` is the polynomial's +// value at that point (base for main-trace columns, ext3 for aux or composition +// columns), and `inv_denom_i = 1 / (z - point_i)` is an ext3 scalar (same for +// every column sharing the evaluation point `z`). +// +// These kernels compute only S. The full OOD value is S scaled by the ext3 +// constant `vanishing * n_inv * g_n_inv`, which is constant across a column, so +// the caller applies it once per column (one ext3 mul per column, independent +// of n). Keeping it on the host means the kernel takes no extra ext3 constant +// argument. +// +// Launch: grid = (num_cols, 1, 1), block = (BARY_BLOCK_DIM, 1, 1). + +#include "goldilocks.cuh" +#include "ext3.cuh" + +// 256 threads/block. One ext3 accumulator per thread in shmem => 6 KiB. +#define BARY_BLOCK_DIM 256 + +__device__ __forceinline__ ext3::Fe3 block_reduce_ext3(ext3::Fe3 my) { + __shared__ uint64_t shm_a[BARY_BLOCK_DIM]; + __shared__ uint64_t shm_b[BARY_BLOCK_DIM]; + __shared__ uint64_t shm_c[BARY_BLOCK_DIM]; + uint32_t tid = threadIdx.x; + shm_a[tid] = my.a; + shm_b[tid] = my.b; + shm_c[tid] = my.c; + __syncthreads(); + for (uint32_t s = BARY_BLOCK_DIM / 2; s > 0; s >>= 1) { + if (tid < s) { + shm_a[tid] = goldilocks::add(shm_a[tid], shm_a[tid + s]); + shm_b[tid] = goldilocks::add(shm_b[tid], shm_b[tid + s]); + shm_c[tid] = goldilocks::add(shm_c[tid], shm_c[tid + s]); + } + __syncthreads(); + } + return ext3::make(shm_a[0], shm_b[0], shm_c[0]); +} + +/// Base-column variant: M base-field columns, each `col_stride` u64 apart. +/// `inv_denoms` is a flat 3N u64 buffer (ext3, interleaved `[a0,b0,c0,...]`). +/// Writes `out_ext3_int`: 3M u64, ext3 interleaved, one accumulator per column. +extern "C" __global__ void barycentric_base_batched( + const uint64_t *columns, + uint64_t col_stride, + const uint64_t *coset_points, + const uint64_t *inv_denoms, + uint64_t n, + uint64_t *out_ext3_int +) { + uint64_t col = blockIdx.x; + const uint64_t *col_data = columns + col * col_stride; + + ext3::Fe3 acc = ext3::zero(); + for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) { + uint64_t eval = col_data[i]; + uint64_t point = coset_points[i]; + uint64_t pe = goldilocks::mul(point, eval); // F * F -> F + ext3::Fe3 inv_d = ext3::make( + inv_denoms[i * 3 + 0], + inv_denoms[i * 3 + 1], + inv_denoms[i * 3 + 2]); + ext3::Fe3 term = ext3::mul_base(inv_d, pe); // E * F -> E + acc = ext3::add(acc, term); + } + + ext3::Fe3 sum = block_reduce_ext3(acc); + if (threadIdx.x == 0) { + out_ext3_int[col * 3 + 0] = sum.a; + out_ext3_int[col * 3 + 1] = sum.b; + out_ext3_int[col * 3 + 2] = sum.c; + } +} + +/// Same as `barycentric_base_batched` but reads rows at stride `row_stride` +/// within each column. Treats the column as an LDE of length `n * row_stride` +/// and sums over the trace-size coset (every `row_stride`-th row). Lets R3 OOD +/// run directly against the LDE device handle from R1 without copying the +/// strided rows into a separate trace-size buffer. +extern "C" __global__ void barycentric_base_batched_strided( + const uint64_t *columns, + uint64_t col_stride, + uint64_t row_stride, + const uint64_t *coset_points, + const uint64_t *inv_denoms, + uint64_t n, + uint64_t *out_ext3_int +) { + uint64_t col = blockIdx.x; + const uint64_t *col_data = columns + col * col_stride; + + ext3::Fe3 acc = ext3::zero(); + for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) { + uint64_t eval = col_data[i * row_stride]; + uint64_t point = coset_points[i]; + uint64_t pe = goldilocks::mul(point, eval); + ext3::Fe3 inv_d = ext3::make( + inv_denoms[i * 3 + 0], + inv_denoms[i * 3 + 1], + inv_denoms[i * 3 + 2]); + ext3::Fe3 term = ext3::mul_base(inv_d, pe); + acc = ext3::add(acc, term); + } + + ext3::Fe3 sum = block_reduce_ext3(acc); + if (threadIdx.x == 0) { + out_ext3_int[col * 3 + 0] = sum.a; + out_ext3_int[col * 3 + 1] = sum.b; + out_ext3_int[col * 3 + 2] = sum.c; + } +} + +/// Ext3-column variant: M ext3 columns stored as 3M base slabs. Column `c` +/// lives at `columns[(c*3+k)*col_stride + i]` for component `k` in 0..3. +extern "C" __global__ void barycentric_ext3_batched( + const uint64_t *columns, + uint64_t col_stride, + const uint64_t *coset_points, + const uint64_t *inv_denoms, + uint64_t n, + uint64_t *out_ext3_int +) { + uint64_t col = blockIdx.x; + const uint64_t *slab_a = columns + (col * 3 + 0) * col_stride; + const uint64_t *slab_b = columns + (col * 3 + 1) * col_stride; + const uint64_t *slab_c = columns + (col * 3 + 2) * col_stride; + + ext3::Fe3 acc = ext3::zero(); + for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) { + ext3::Fe3 eval = ext3::make(slab_a[i], slab_b[i], slab_c[i]); + uint64_t point = coset_points[i]; + // F * E -> E. Point times eval, componentwise on the 3 base components. + ext3::Fe3 pe = ext3::mul_base(eval, point); + // E * E -> E + ext3::Fe3 inv_d = ext3::make( + inv_denoms[i * 3 + 0], + inv_denoms[i * 3 + 1], + inv_denoms[i * 3 + 2]); + ext3::Fe3 term = ext3::mul(pe, inv_d); + acc = ext3::add(acc, term); + } + + ext3::Fe3 sum = block_reduce_ext3(acc); + if (threadIdx.x == 0) { + out_ext3_int[col * 3 + 0] = sum.a; + out_ext3_int[col * 3 + 1] = sum.b; + out_ext3_int[col * 3 + 2] = sum.c; + } +} + +/// Strided ext3 variant for R3 OOD of aux LDE. +extern "C" __global__ void barycentric_ext3_batched_strided( + const uint64_t *columns, + uint64_t col_stride, + uint64_t row_stride, + const uint64_t *coset_points, + const uint64_t *inv_denoms, + uint64_t n, + uint64_t *out_ext3_int +) { + uint64_t col = blockIdx.x; + const uint64_t *slab_a = columns + (col * 3 + 0) * col_stride; + const uint64_t *slab_b = columns + (col * 3 + 1) * col_stride; + const uint64_t *slab_c = columns + (col * 3 + 2) * col_stride; + + ext3::Fe3 acc = ext3::zero(); + for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) { + uint64_t lde_i = i * row_stride; + ext3::Fe3 eval = ext3::make(slab_a[lde_i], slab_b[lde_i], slab_c[lde_i]); + uint64_t point = coset_points[i]; + ext3::Fe3 pe = ext3::mul_base(eval, point); + ext3::Fe3 inv_d = ext3::make( + inv_denoms[i * 3 + 0], + inv_denoms[i * 3 + 1], + inv_denoms[i * 3 + 2]); + ext3::Fe3 term = ext3::mul(pe, inv_d); + acc = ext3::add(acc, term); + } + + ext3::Fe3 sum = block_reduce_ext3(acc); + if (threadIdx.x == 0) { + out_ext3_int[col * 3 + 0] = sum.a; + out_ext3_int[col * 3 + 1] = sum.b; + out_ext3_int[col * 3 + 2] = sum.c; + } +} diff --git a/crypto/math-cuda/src/barycentric.rs b/crypto/math-cuda/src/barycentric.rs new file mode 100644 index 000000000..b4eb12dfd --- /dev/null +++ b/crypto/math-cuda/src/barycentric.rs @@ -0,0 +1,227 @@ +//! Barycentric evaluation on device. Matches the CPU +//! [`interpolate_coset_eval_ext_with_g_n_inv`](math::polynomial::interpolate_coset_eval_ext_with_g_n_inv) +//! +//! The kernels compute only the unscaled barycentric sum +//! S = sum over i of point_i * eval_i * inv_denom_i +//! per column. The caller multiplies each `S` by the ext3 scalar +//! `(z^N - g^N) * 1/N * 1/g^N` to get the final OOD value. That scaling is +//! one ext3 mul per column and stays on host. + +use cudarc::driver::{LaunchConfig, PushKernelArg}; + +use crate::Result; +use crate::device::backend; +use crate::lde::{GpuLdeBase, GpuLdeExt3}; + +const BLOCK_DIM: u32 = 256; + +/// Barycentric sums over M base-field columns, each of length `n`, laid out +/// with stride `col_stride` (so column `c` is at `columns[c*col_stride .. +/// c*col_stride + n]`). `inv_denoms` is 3N u64 (ext3 interleaved). +/// Returns 3M u64 (ext3 interleaved), one per column. +pub fn barycentric_base( + columns: &[u64], + col_stride: usize, + coset_points: &[u64], + inv_denoms_ext3: &[u64], + n: usize, + num_cols: usize, +) -> Result> { + assert_eq!(coset_points.len(), n); + assert_eq!(inv_denoms_ext3.len(), 3 * n); + assert!(columns.len() >= num_cols * col_stride); + // Kernel reads col_data[0..n] per column, so col_stride must cover at + // least n u64s. Smaller strides would read past the column boundary. + assert!( + col_stride >= n, + "col_stride {col_stride} < n {n}: kernel reads col_data[0..n] but stride is shorter" + ); + if num_cols == 0 || n == 0 { + return Ok(vec![0; 3 * num_cols]); + } + + let be = backend()?; + let stream = be.next_stream(); + + let cols_dev = stream.clone_htod(&columns[..num_cols * col_stride])?; + let points_dev = stream.clone_htod(coset_points)?; + let inv_dev = stream.clone_htod(inv_denoms_ext3)?; + let mut out_dev = stream.alloc_zeros::(3 * num_cols)?; + + let col_stride_u64 = col_stride as u64; + let n_u64 = n as u64; + let cfg = LaunchConfig { + grid_dim: (num_cols as u32, 1, 1), + block_dim: (BLOCK_DIM, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.barycentric_base_batched) + .arg(&cols_dev) + .arg(&col_stride_u64) + .arg(&points_dev) + .arg(&inv_dev) + .arg(&n_u64) + .arg(&mut out_dev) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Same as [`barycentric_base`] but `columns` holds M ext3 columns in the +/// de-interleaved layout: slab `c*3 + k` at offset `(c*3+k)*col_stride`. +/// `columns.len() >= num_cols * 3 * col_stride`. +pub fn barycentric_ext3( + columns: &[u64], + col_stride: usize, + coset_points: &[u64], + inv_denoms_ext3: &[u64], + n: usize, + num_cols: usize, +) -> Result> { + assert_eq!(coset_points.len(), n); + assert_eq!(inv_denoms_ext3.len(), 3 * n); + assert!(columns.len() >= num_cols * 3 * col_stride); + // Each ext3 slab is read at indices [0..n), so col_stride must cover at + // least n u64s. Smaller strides would read past the slab boundary. + assert!( + col_stride >= n, + "col_stride {col_stride} < n {n}: kernel reads slab[0..n] but stride is shorter" + ); + if num_cols == 0 || n == 0 { + return Ok(vec![0; 3 * num_cols]); + } + + let be = backend()?; + let stream = be.next_stream(); + + let cols_dev = stream.clone_htod(&columns[..num_cols * 3 * col_stride])?; + let points_dev = stream.clone_htod(coset_points)?; + let inv_dev = stream.clone_htod(inv_denoms_ext3)?; + let mut out_dev = stream.alloc_zeros::(3 * num_cols)?; + + let col_stride_u64 = col_stride as u64; + let n_u64 = n as u64; + let cfg = LaunchConfig { + grid_dim: (num_cols as u32, 1, 1), + block_dim: (BLOCK_DIM, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.barycentric_ext3_batched) + .arg(&cols_dev) + .arg(&col_stride_u64) + .arg(&points_dev) + .arg(&inv_dev) + .arg(&n_u64) + .arg(&mut out_dev) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Run `barycentric_base_batched_strided` over the base LDE already on +/// device (`main_handle`), summing over the trace-size coset (every +/// `row_stride = blowup_factor`-th row). H2Ds only the coset points and +/// inv_denoms. The column data never crosses PCIe. +pub fn barycentric_base_on_device( + main_handle: &GpuLdeBase, + row_stride: usize, + coset_points: &[u64], + inv_denoms_ext3: &[u64], + n: usize, +) -> Result> { + assert_eq!(coset_points.len(), n); + assert_eq!(inv_denoms_ext3.len(), 3 * n); + let num_cols = main_handle.m; + if num_cols == 0 || n == 0 { + return Ok(vec![0; 3 * num_cols]); + } + let col_stride = main_handle.lde_size; + + let be = backend()?; + let stream = be.next_stream(); + + let points_dev = stream.clone_htod(coset_points)?; + let inv_dev = stream.clone_htod(inv_denoms_ext3)?; + let mut out_dev = stream.alloc_zeros::(3 * num_cols)?; + + let col_stride_u64 = col_stride as u64; + let row_stride_u64 = row_stride as u64; + let n_u64 = n as u64; + let cfg = LaunchConfig { + grid_dim: (num_cols as u32, 1, 1), + block_dim: (BLOCK_DIM, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.barycentric_base_batched_strided) + .arg(main_handle.buf.as_ref()) + .arg(&col_stride_u64) + .arg(&row_stride_u64) + .arg(&points_dev) + .arg(&inv_dev) + .arg(&n_u64) + .arg(&mut out_dev) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; + Ok(out) +} + +/// Ext3 counterpart of [`barycentric_base_on_device`]. Reads the aux LDE +/// from the de-interleaved device handle. +pub fn barycentric_ext3_on_device( + aux_handle: &GpuLdeExt3, + row_stride: usize, + coset_points: &[u64], + inv_denoms_ext3: &[u64], + n: usize, +) -> Result> { + assert_eq!(coset_points.len(), n); + assert_eq!(inv_denoms_ext3.len(), 3 * n); + let num_cols = aux_handle.m; + if num_cols == 0 || n == 0 { + return Ok(vec![0; 3 * num_cols]); + } + let col_stride = aux_handle.lde_size; + + let be = backend()?; + let stream = be.next_stream(); + + let points_dev = stream.clone_htod(coset_points)?; + let inv_dev = stream.clone_htod(inv_denoms_ext3)?; + let mut out_dev = stream.alloc_zeros::(3 * num_cols)?; + + let col_stride_u64 = col_stride as u64; + let row_stride_u64 = row_stride as u64; + let n_u64 = n as u64; + let cfg = LaunchConfig { + grid_dim: (num_cols as u32, 1, 1), + block_dim: (BLOCK_DIM, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&be.barycentric_ext3_batched_strided) + .arg(aux_handle.buf.as_ref()) + .arg(&col_stride_u64) + .arg(&row_stride_u64) + .arg(&points_dev) + .arg(&inv_dev) + .arg(&n_u64) + .arg(&mut out_dev) + .launch(cfg)?; + } + let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; + Ok(out) +} diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 3af2ed546..353932ba6 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -93,6 +93,7 @@ impl Drop for PinnedStaging { const ARITH_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/arith.ptx")); const NTT_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/ntt.ptx")); const KECCAK_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/keccak.ptx")); +const BARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/barycentric.ptx")); /// Number of CUDA streams in the pool. Larger pools let many rayon-parallel /// callers overlap on the GPU without serializing on stream ownership. The /// default stream is deliberately excluded because it synchronises with all @@ -143,6 +144,12 @@ pub struct Backend { pub keccak_fri_leaves_ext3: CudaFunction, pub keccak_merkle_level: CudaFunction, + // barycentric.ptx + pub barycentric_base_batched: CudaFunction, + pub barycentric_ext3_batched: CudaFunction, + pub barycentric_base_batched_strided: CudaFunction, + pub barycentric_ext3_batched_strided: CudaFunction, + // Twiddle caches keyed by log_n. fwd_twiddles: Mutex>>>>, inv_twiddles: Mutex>>>>, @@ -160,6 +167,7 @@ impl Backend { let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?; let ntt = ctx.load_module(Ptx::from_src(NTT_PTX))?; let keccak = ctx.load_module(Ptx::from_src(KECCAK_PTX))?; + let bary = ctx.load_module(Ptx::from_src(BARY_PTX))?; let mut streams = Vec::with_capacity(STREAM_POOL_SIZE); for _ in 0..STREAM_POOL_SIZE { @@ -212,6 +220,12 @@ impl Backend { keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?, keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?, keccak_merkle_level: keccak.load_function("keccak_merkle_level")?, + barycentric_base_batched: bary.load_function("barycentric_base_batched")?, + barycentric_ext3_batched: bary.load_function("barycentric_ext3_batched")?, + barycentric_base_batched_strided: bary + .load_function("barycentric_base_batched_strided")?, + barycentric_ext3_batched_strided: bary + .load_function("barycentric_ext3_batched_strided")?, fwd_twiddles: Mutex::new(vec![None; max_log]), inv_twiddles: Mutex::new(vec![None; max_log]), ctx, diff --git a/crypto/math-cuda/src/lib.rs b/crypto/math-cuda/src/lib.rs index d1b4e1210..a5bb8defb 100644 --- a/crypto/math-cuda/src/lib.rs +++ b/crypto/math-cuda/src/lib.rs @@ -4,6 +4,7 @@ //! element-wise arith) is either internal to the LDE pipeline or used by the //! parity test suite. +pub mod barycentric; pub mod device; pub mod lde; pub mod merkle; diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 6faf12b51..932e81325 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -55,6 +55,7 @@ pub fn keccak_leaves_base( &mut out_dev.as_view_mut(), )?; let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; Ok(out) } @@ -89,6 +90,7 @@ pub fn keccak_leaves_ext3( &mut out_dev.as_view_mut(), )?; let out = stream.clone_dtoh(&out_dev)?; + stream.synchronize()?; Ok(out) } @@ -211,6 +213,7 @@ pub fn build_merkle_tree_on_device(hashed_leaves: &[u8]) -> Result> { build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, leaves_len)?; let out = stream.clone_dtoh(&nodes_dev)?; + stream.synchronize()?; Ok(out) } @@ -280,6 +283,7 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; let out = stream.clone_dtoh(&nodes_dev)?; + stream.synchronize()?; drop(staging); Ok(out) } @@ -325,6 +329,7 @@ pub fn build_fri_layer_tree_from_evals_ext3(evals: &[u64]) -> Result> { build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; let out = stream.clone_dtoh(&nodes_dev)?; + stream.synchronize()?; Ok(out) } diff --git a/crypto/math-cuda/tests/barycentric.rs b/crypto/math-cuda/tests/barycentric.rs new file mode 100644 index 000000000..e9fac4096 --- /dev/null +++ b/crypto/math-cuda/tests/barycentric.rs @@ -0,0 +1,144 @@ +//! Parity: GPU barycentric sum vs CPU. We don't call the upstream +//! `interpolate_coset_eval_*_with_g_n_inv` directly because the GPU kernel +//! returns only the unscaled sum and the caller applies the ext3 scale. We +//! replicate the same unscaled sum on CPU for comparison. + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::IsPrimeField; +use math_cuda::barycentric::{barycentric_base, barycentric_ext3}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +fn canon_triplet(e: &Fp3) -> [u64; 3] { + [ + GoldilocksField::canonical(e.value()[0].value()), + GoldilocksField::canonical(e.value()[1].value()), + GoldilocksField::canonical(e.value()[2].value()), + ] +} + +fn canon_triplet_raw(t: &[u64]) -> [u64; 3] { + [ + GoldilocksField::canonical(&t[0]), + GoldilocksField::canonical(&t[1]), + GoldilocksField::canonical(&t[2]), + ] +} + +fn random_fp(rng: &mut ChaCha8Rng) -> Fp { + Fp::from_raw(rng.r#gen::()) +} +fn random_fp3(rng: &mut ChaCha8Rng) -> Fp3 { + Fp3::new([random_fp(rng), random_fp(rng), random_fp(rng)]) +} + +#[test] +fn barycentric_base_sum_matches_cpu() { + for &(log_n, num_cols) in &[(4u32, 1usize), (8, 5), (10, 17), (12, 3)] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(100 + log_n as u64 * 7 + num_cols as u64); + + let coset_points: Vec = (0..n).map(|_| random_fp(&mut rng)).collect(); + let inv_denoms: Vec = (0..n).map(|_| random_fp3(&mut rng)).collect(); + + // Lay out columns base: column c contiguous slab of n u64s. + let cols_fp: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| random_fp(&mut rng)).collect()) + .collect(); + let mut columns_flat = vec![0u64; num_cols * n]; + for (c, col) in cols_fp.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + columns_flat[c * n + r] = *e.value(); + } + } + let points_raw: Vec = coset_points.iter().map(|e| *e.value()).collect(); + let inv_denoms_raw: Vec = inv_denoms + .iter() + .flat_map(|e| { + [ + *e.value()[0].value(), + *e.value()[1].value(), + *e.value()[2].value(), + ] + }) + .collect(); + + let gpu = + barycentric_base(&columns_flat, n, &points_raw, &inv_denoms_raw, n, num_cols).unwrap(); + + for (c, col) in cols_fp.iter().enumerate() { + // CPU reference sum. Force ext3 by embedding the base product. + let mut sum = Fp3::zero(); + for i in 0..n { + let pe_base: Fp = &coset_points[i] * &col[i]; // F * F = F + // Base * ext3 = ext3 (base is on the left, IsSubFieldOf direction). + let pe_ext3: Fp3 = &pe_base * &inv_denoms[i]; // F * E = E + sum = &sum + &pe_ext3; + } + let gpu_sum = canon_triplet_raw(&gpu[c * 3..(c + 1) * 3]); + let cpu_sum = canon_triplet(&sum); + assert_eq!( + gpu_sum, cpu_sum, + "base col {c} log_n={log_n} num_cols={num_cols}" + ); + } + } +} + +#[test] +fn barycentric_ext3_sum_matches_cpu() { + for &(log_n, num_cols) in &[(4u32, 1usize), (8, 3), (10, 7)] { + let n = 1 << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(200 + log_n as u64 * 11 + num_cols as u64); + + let coset_points: Vec = (0..n).map(|_| random_fp(&mut rng)).collect(); + let inv_denoms: Vec = (0..n).map(|_| random_fp3(&mut rng)).collect(); + let cols_fp3: Vec> = (0..num_cols) + .map(|_| (0..n).map(|_| random_fp3(&mut rng)).collect()) + .collect(); + + // De-interleaved layout: 3 base slabs per ext3 column. + let mut columns_flat = vec![0u64; num_cols * 3 * n]; + for (c, col) in cols_fp3.iter().enumerate() { + for (r, e) in col.iter().enumerate() { + columns_flat[(c * 3) * n + r] = *e.value()[0].value(); + columns_flat[(c * 3 + 1) * n + r] = *e.value()[1].value(); + columns_flat[(c * 3 + 2) * n + r] = *e.value()[2].value(); + } + } + let points_raw: Vec = coset_points.iter().map(|e| *e.value()).collect(); + let inv_denoms_raw: Vec = inv_denoms + .iter() + .flat_map(|e| { + [ + *e.value()[0].value(), + *e.value()[1].value(), + *e.value()[2].value(), + ] + }) + .collect(); + + let gpu = + barycentric_ext3(&columns_flat, n, &points_raw, &inv_denoms_raw, n, num_cols).unwrap(); + + for (c, col) in cols_fp3.iter().enumerate() { + let mut sum = Fp3::zero(); + for i in 0..n { + let pe: Fp3 = &coset_points[i] * &col[i]; // F * E = E + let term: Fp3 = &pe * &inv_denoms[i]; // E * E = E + sum = &sum + &term; + } + let gpu_sum = canon_triplet_raw(&gpu[c * 3..(c + 1) * 3]); + let cpu_sum = canon_triplet(&sum); + assert_eq!( + gpu_sum, cpu_sum, + "ext3 col {c} log_n={log_n} num_cols={num_cols}" + ); + } + } +} diff --git a/crypto/math-cuda/tests/barycentric_strided.rs b/crypto/math-cuda/tests/barycentric_strided.rs new file mode 100644 index 000000000..653ef4e38 --- /dev/null +++ b/crypto/math-cuda/tests/barycentric_strided.rs @@ -0,0 +1,147 @@ +//! Parity: strided barycentric kernels (used by R3 OOD on device LDE handles) +//! match the non-strided kernels fed a pre-strided column buffer. + +use std::sync::Arc; + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math_cuda::barycentric::{ + barycentric_base, barycentric_base_on_device, barycentric_ext3, barycentric_ext3_on_device, +}; +use math_cuda::device::backend; +use math_cuda::lde::{GpuLdeBase, GpuLdeExt3}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +fn rand_fp(rng: &mut ChaCha8Rng) -> Fp { + Fp::from_raw(rng.r#gen::()) +} +fn rand_fp3(rng: &mut ChaCha8Rng) -> Fp3 { + Fp3::new([rand_fp(rng), rand_fp(rng), rand_fp(rng)]) +} + +fn run_base(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) { + let n = 1usize << log_trace; + let lde_size = n * blowup; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let lde_data: Vec> = (0..num_cols) + .map(|_| (0..lde_size).map(|_| rand_fp(&mut rng)).collect()) + .collect(); + let coset_points: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + let inv_denoms_ext3: Vec = (0..(n * 3)).map(|_| rng.r#gen::()).collect(); + + // Pack full LDE column-major for device. + let mut lde_flat = vec![0u64; num_cols * lde_size]; + for (c, col) in lde_data.iter().enumerate() { + for (r, v) in col.iter().enumerate() { + lde_flat[c * lde_size + r] = *v.value(); + } + } + let be = backend().unwrap(); + let stream = be.next_stream(); + let lde_dev = stream.clone_htod(&lde_flat).unwrap(); + stream.synchronize().unwrap(); + let handle = GpuLdeBase { + buf: Arc::new(lde_dev), + m: num_cols, + lde_size, + }; + + // Pre-strided buffer for non-strided reference: trace-size picks of each col. + let mut pre_strided = vec![0u64; num_cols * n]; + for c in 0..num_cols { + for i in 0..n { + pre_strided[c * n + i] = lde_flat[c * lde_size + i * blowup]; + } + } + + let reference = barycentric_base( + &pre_strided, + n, + &coset_points, + &inv_denoms_ext3, + n, + num_cols, + ) + .unwrap(); + + let strided = + barycentric_base_on_device(&handle, blowup, &coset_points, &inv_denoms_ext3, n).unwrap(); + + assert_eq!( + reference, strided, + "base strided mismatch (log_trace={log_trace}, blowup={blowup}, cols={num_cols})" + ); +} + +fn run_ext3(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) { + let n = 1usize << log_trace; + let lde_size = n * blowup; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let lde_data: Vec> = (0..num_cols) + .map(|_| (0..lde_size).map(|_| rand_fp3(&mut rng)).collect()) + .collect(); + let coset_points: Vec = (0..n).map(|_| rng.r#gen::()).collect(); + let inv_denoms_ext3: Vec = (0..(n * 3)).map(|_| rng.r#gen::()).collect(); + + // Pack LDE de-interleaved: (m*3) by lde_size. + let mut lde_flat = vec![0u64; num_cols * 3 * lde_size]; + for (c, col) in lde_data.iter().enumerate() { + for (r, v) in col.iter().enumerate() { + lde_flat[(c * 3) * lde_size + r] = *v.value()[0].value(); + lde_flat[(c * 3 + 1) * lde_size + r] = *v.value()[1].value(); + lde_flat[(c * 3 + 2) * lde_size + r] = *v.value()[2].value(); + } + } + let be = backend().unwrap(); + let stream = be.next_stream(); + let lde_dev = stream.clone_htod(&lde_flat).unwrap(); + stream.synchronize().unwrap(); + let handle = GpuLdeExt3 { + buf: Arc::new(lde_dev), + m: num_cols, + lde_size, + }; + + // Pre-strided buffer for non-strided reference. + let mut pre_strided = vec![0u64; num_cols * 3 * n]; + for c in 0..num_cols { + for i in 0..n { + pre_strided[(c * 3) * n + i] = lde_flat[(c * 3) * lde_size + i * blowup]; + pre_strided[(c * 3 + 1) * n + i] = lde_flat[(c * 3 + 1) * lde_size + i * blowup]; + pre_strided[(c * 3 + 2) * n + i] = lde_flat[(c * 3 + 2) * lde_size + i * blowup]; + } + } + let reference = barycentric_ext3( + &pre_strided, + n, + &coset_points, + &inv_denoms_ext3, + n, + num_cols, + ) + .unwrap(); + + let strided = + barycentric_ext3_on_device(&handle, blowup, &coset_points, &inv_denoms_ext3, n).unwrap(); + + assert_eq!(reference, strided, "ext3 strided mismatch"); +} + +#[test] +fn bary_base_strided_small() { + for (log_t, blowup, cols) in [(4u32, 2usize, 3usize), (8, 4, 10), (12, 2, 5)] { + run_base(log_t, blowup, cols, 1000 + log_t as u64); + } +} + +#[test] +fn bary_ext3_strided_small() { + for (log_t, blowup, cols) in [(4u32, 2usize, 2usize), (8, 4, 5), (10, 2, 3)] { + run_ext3(log_t, blowup, cols, 2000 + log_t as u64); + } +} diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs new file mode 100644 index 000000000..29e33b6fe --- /dev/null +++ b/crypto/math-cuda/tests/comp_poly_tree.rs @@ -0,0 +1,235 @@ +//! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree` +//! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU +//! pipeline produced by `commit_composition_polynomial`. + +use math::field::element::FieldElement; +use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; +use math::field::goldilocks::GoldilocksField; +use math::field::traits::{IsField, IsPrimeField}; +use math::polynomial::Polynomial; +use math::traits::ByteConversion; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use sha3::{Digest, Keccak256}; + +type Fp = FieldElement; +type Fp3 = FieldElement; + +fn reverse_index(i: u64, n: u64) -> u64 { + let log_n = n.trailing_zeros(); + i.reverse_bits() >> (64 - log_n) +} + +fn offset_weights(n: usize, offset: u64) -> Vec { + let mut w = Vec::with_capacity(n); + let mut cur = 1u64; + for _ in 0..n { + w.push(cur); + cur = GoldilocksField::mul(&cur, &offset); + } + w +} + +fn rand_ext3(rng: &mut ChaCha8Rng) -> Fp3 { + Fp3::new([ + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + Fp::from_raw(rng.r#gen::()), + ]) +} + +fn ext3_to_u64s(col: &[Fp3]) -> Vec { + let mut out = Vec::with_capacity(col.len() * 3); + for e in col { + out.push(*e.value()[0].value()); + out.push(*e.value()[1].value()); + out.push(*e.value()[2].value()); + } + out +} + +fn u64s_to_ext3(raw: &[u64]) -> Vec { + let mut out = Vec::with_capacity(raw.len() / 3); + for i in 0..raw.len() / 3 { + out.push(Fp3::new([ + Fp::from_raw(raw[i * 3]), + Fp::from_raw(raw[i * 3 + 1]), + Fp::from_raw(raw[i * 3 + 2]), + ])); + } + out +} + +fn canon_ext3(e: &Fp3) -> [u64; 3] { + [ + GoldilocksField::canonical(e.value()[0].value()), + GoldilocksField::canonical(e.value()[1].value()), + GoldilocksField::canonical(e.value()[2].value()), + ] +} + +/// CPU: evaluate polynomial on coset via `Polynomial::evaluate_offset_fft`. +fn cpu_evaluate(coefs: &[Fp3], blowup: usize, offset: &Fp) -> Vec { + let poly = Polynomial::new(coefs); + Polynomial::evaluate_offset_fft::(&poly, blowup, None, offset).unwrap() +} + +fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { + let mut h = Keccak256::new(); + h.update(left); + h.update(right); + let mut out = [0u8; 32]; + out.copy_from_slice(&h.finalize()); + out +} + +/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves. +fn cpu_tree_nodes(parts: &[Vec]) -> Vec<[u8; 32]> { + let num_rows = parts[0].len(); + let num_parts = parts.len(); + let num_leaves = num_rows / 2; + assert!(num_leaves.is_power_of_two() && num_leaves >= 1); + let byte_len = 24; + + let hashed_leaves: Vec<[u8; 32]> = (0..num_leaves) + .map(|leaf_idx| { + let br_0 = reverse_index(2 * leaf_idx as u64, num_rows as u64) as usize; + let br_1 = reverse_index(2 * leaf_idx as u64 + 1, num_rows as u64) as usize; + let total_bytes = 2 * num_parts * byte_len; + let mut buf = vec![0u8; total_bytes]; + let mut offset = 0; + for part in parts.iter() { + part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + for part in parts.iter() { + part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]); + offset += byte_len; + } + let mut h = Keccak256::new(); + h.update(&buf); + let mut r = [0u8; 32]; + r.copy_from_slice(&h.finalize()); + r + }) + .collect(); + + let total = 2 * num_leaves - 1; + let mut nodes: Vec<[u8; 32]> = vec![[0u8; 32]; total]; + for (i, leaf) in hashed_leaves.iter().enumerate() { + nodes[num_leaves - 1 + i] = *leaf; + } + let mut level_begin = num_leaves - 1; + while level_begin != 0 { + let new_begin = level_begin / 2; + let n_pairs = level_begin - new_begin; + for j in 0..n_pairs { + let left = nodes[level_begin + 2 * j]; + let right = nodes[level_begin + 2 * j + 1]; + nodes[new_begin + j] = cpu_hash_pair(&left, &right); + } + level_begin = new_begin; + } + nodes +} + +fn run_parity(log_n: u32, blowup: usize, num_parts: usize, seed: u64) { + let n = 1usize << log_n; + let lde_size = n * blowup; + assert!(lde_size >= 2); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + + // Random ext3 coefficient vectors per part. + let parts_cpu: Vec> = (0..num_parts) + .map(|_| (0..n).map(|_| rand_ext3(&mut rng)).collect()) + .collect(); + + // CPU LDE via evaluate_offset_fft, then CPU tree. + let offset_u64 = rng.r#gen::() | 1; + let offset = Fp::from_raw(offset_u64); + let cpu_lde_parts: Vec> = parts_cpu + .iter() + .map(|c| cpu_evaluate(c, blowup, &offset)) + .collect(); + let cpu_nodes = cpu_tree_nodes(&cpu_lde_parts); + + // GPU fused call. + let weights = offset_weights(n, offset_u64); + let coefs_u64: Vec> = parts_cpu.iter().map(|c| ext3_to_u64s(c)).collect(); + let coefs_slices: Vec<&[u64]> = coefs_u64.iter().map(|v| v.as_slice()).collect(); + let mut outputs_raw: Vec> = (0..num_parts).map(|_| vec![0u64; 3 * lde_size]).collect(); + let mut outputs_slices: Vec<&mut [u64]> = + outputs_raw.iter_mut().map(|v| v.as_mut_slice()).collect(); + // R2 leaves are row-pairs: num_leaves = lde_size / 2, so + // tight_total_nodes = 2 * num_leaves - 1 = lde_size - 1. + let total_nodes = lde_size - 1; + let mut nodes_bytes = vec![0u8; total_nodes * 32]; + + math_cuda::lde::evaluate_poly_coset_batch_ext3_into_with_merkle_tree( + &coefs_slices, + n, + blowup, + &weights, + &mut outputs_slices, + &mut nodes_bytes, + ) + .unwrap(); + + // Compare LDE parts. + for (c, cpu_col) in cpu_lde_parts.iter().enumerate() { + let gpu_col = u64s_to_ext3(&outputs_raw[c]); + for i in 0..lde_size { + assert_eq!( + canon_ext3(&gpu_col[i]), + canon_ext3(&cpu_col[i]), + "LDE mismatch part {c} row {i} log_n={log_n} blowup={blowup}" + ); + } + } + + // Compare tree nodes. GPU writes `2*num_leaves - 1 = lde_size - 1` nodes. + let num_leaves = lde_size / 2; + let tight_total = 2 * num_leaves - 1; + assert_eq!(cpu_nodes.len(), tight_total); + for i in 0..tight_total { + let g = &nodes_bytes[i * 32..(i + 1) * 32]; + let c = &cpu_nodes[i]; + assert_eq!( + g, c, + "tree node {i} mismatch at log_n={log_n} blowup={blowup} parts={num_parts}" + ); + } +} + +#[test] +fn comp_poly_tree_small() { + for log_n in 2u32..=5 { + for &blowup in &[2usize, 4, 8] { + for &parts in &[1usize, 2, 4] { + run_parity( + log_n, + blowup, + parts, + 1000 + log_n as u64 * 31 + parts as u64, + ); + } + } + } +} + +#[test] +fn comp_poly_tree_medium() { + for &(log_n, blowup, parts) in &[(10u32, 4usize, 4usize), (12, 2, 3)] { + run_parity( + log_n, + blowup, + parts, + 2000 + log_n as u64 * 11 + parts as u64, + ); + } +} + +#[test] +fn comp_poly_tree_large() { + run_parity(14, 2, 4, 9999); +} diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 6c0d8d9e9..530e2f6b9 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -5,17 +5,21 @@ //! launch overhead dominates. Produces the same natural-order, non-canonical //! LDE evaluations as the CPU path. +use core::mem::transmute_copy; use std::any::TypeId; use std::slice::{from_raw_parts, from_raw_parts_mut}; use std::sync::OnceLock; use std::sync::atomic::{AtomicU64, Ordering}; +use crypto::merkle_tree::merkle::MerkleTree; +use crypto::merkle_tree::traits::IsMerkleTreeBackend; use math::field::element::FieldElement; use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; use math::field::goldilocks::GoldilocksField; -use math::field::traits::{IsField, IsSubFieldOf}; +use math::field::traits::{IsFFTField, IsField, IsSubFieldOf}; use crate::domain::Domain; +use crate::trace::LDETraceTable; /// Break-even LDE size. For LDE sizes smaller than this, the CPU /// `coset_lde_full_expand` completes in a few hundred microseconds and the @@ -56,6 +60,9 @@ pub fn reset_all_gpu_call_counters() { GPU_EXTEND_HALVES_CALLS.store(0, Ordering::Relaxed); GPU_LEAF_HASH_CALLS.store(0, Ordering::Relaxed); GPU_MERKLE_TREE_CALLS.store(0, Ordering::Relaxed); + GPU_PARTS_LDE_CALLS.store(0, Ordering::Relaxed); + GPU_BARY_CALLS.store(0, Ordering::Relaxed); + GPU_COMP_POLY_TREE_CALLS.store(0, Ordering::Relaxed); } pub(crate) static GPU_EXTEND_HALVES_CALLS: AtomicU64 = AtomicU64::new(0); @@ -267,7 +274,10 @@ fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> { // SAFETY: every byte will be overwritten via the GPU D2H before the // contents are read. The caller computes the byte-length view from the // returned `nodes` Vec using `total_nodes.checked_mul(32)`. - unsafe { nodes.set_len(total_nodes) }; + #[allow(clippy::uninit_vec)] + unsafe { + nodes.set_len(total_nodes) + }; Some((nodes, total_nodes)) } @@ -342,13 +352,14 @@ where /// /// Returns `None` when the GPU path doesn't apply (too small, or CPU path /// should be used); in that case the caller runs its existing rayon::join. +#[allow(clippy::type_complexity)] pub(crate) fn try_extend_two_halves_gpu( h0: &[FieldElement], h1: &[FieldElement], domain: &Domain, ) -> Option<(Vec>, Vec>)> where - F: math::field::traits::IsFFTField + IsField + 'static, + F: IsFFTField + IsField + 'static, E: IsField + 'static, F: IsSubFieldOf, { @@ -391,7 +402,7 @@ where // F == GoldilocksField by TypeId check above, so value is u64. let v: u64 = unsafe { *(w.value() as *const _ as *const u64) }; weights_u64.push(v); - w = w * &g_inv; + w *= &g_inv; } // Pre-allocate outputs. @@ -438,14 +449,11 @@ pub(crate) fn try_expand_leaf_and_tree_batched_keep( columns: &mut [Vec>], blowup_factor: usize, weights: &[FieldElement], -) -> Option<( - crypto::merkle_tree::merkle::MerkleTree, - math_cuda::lde::GpuLdeBase, -)> +) -> Option<(MerkleTree, math_cuda::lde::GpuLdeBase)> where F: IsField + 'static, E: IsField + 'static, - B: crypto::merkle_tree::traits::IsMerkleTreeBackend, + B: IsMerkleTreeBackend, { let (n, lde_size) = match check_base_layout::(columns, blowup_factor) { LayoutDispatch::Empty | LayoutDispatch::Skip => return None, @@ -486,7 +494,7 @@ where } }; - let tree = crypto::merkle_tree::merkle::MerkleTree::::from_precomputed_nodes(nodes)?; + let tree = MerkleTree::::from_precomputed_nodes(nodes)?; Some((tree, handle)) } @@ -498,14 +506,11 @@ pub(crate) fn try_expand_leaf_and_tree_batched_ext3_keep( columns: &mut [Vec>], blowup_factor: usize, weights: &[FieldElement], -) -> Option<( - crypto::merkle_tree::merkle::MerkleTree, - math_cuda::lde::GpuLdeExt3, -)> +) -> Option<(MerkleTree, math_cuda::lde::GpuLdeExt3)> where F: IsField + 'static, E: IsField + 'static, - B: crypto::merkle_tree::traits::IsMerkleTreeBackend, + B: IsMerkleTreeBackend, { let (n, lde_size) = match check_ext3_layout::(columns, blowup_factor) { LayoutDispatch::Empty | LayoutDispatch::Skip => return None, @@ -547,7 +552,7 @@ where } }; - let tree = crypto::merkle_tree::merkle::MerkleTree::::from_precomputed_nodes(nodes)?; + let tree = MerkleTree::::from_precomputed_nodes(nodes)?; Some((tree, handle)) } @@ -603,3 +608,398 @@ static GPU_MERKLE_TREE_CALLS: AtomicU64 = AtomicU64::new(0); pub fn gpu_merkle_tree_calls() -> u64 { GPU_MERKLE_TREE_CALLS.load(Ordering::Relaxed) } + +// ============================================================================ +// PR-3: R2 composition-parts LDE + Merkle commit + R3 OOD barycentric +// ============================================================================ + +/// R2 dispatch counter: incremented once per +/// [`try_evaluate_parts_on_lde_gpu`] call that actually routed to the GPU. +pub(crate) static GPU_PARTS_LDE_CALLS: AtomicU64 = AtomicU64::new(0); +pub fn gpu_parts_lde_calls() -> u64 { + GPU_PARTS_LDE_CALLS.load(Ordering::Relaxed) +} + +/// R3 dispatch counter: incremented once per `try_barycentric_*_on_handle` +/// call that actually routed to the GPU. +pub(crate) static GPU_BARY_CALLS: AtomicU64 = AtomicU64::new(0); +pub fn gpu_bary_calls() -> u64 { + GPU_BARY_CALLS.load(Ordering::Relaxed) +} + +/// R2 comp-poly tree counter: incremented once per +/// [`try_build_comp_poly_tree_gpu`] call that actually routed to the GPU. +/// Distinct from `GPU_MERKLE_TREE_CALLS` (R1 main/aux tree builds) so the +/// two dispatch sites can be diagnosed independently. +pub(crate) static GPU_COMP_POLY_TREE_CALLS: AtomicU64 = AtomicU64::new(0); +pub fn gpu_comp_poly_tree_calls() -> u64 { + GPU_COMP_POLY_TREE_CALLS.load(Ordering::Relaxed) +} + +/// Trace-size threshold for the R3 OOD barycentric GPU path. Below this the +/// rayon CPU path is competitive and PCIe round-trip overhead would dominate. +/// Override via `LAMBDA_VM_GPU_BARY_THRESHOLD`. +const DEFAULT_GPU_BARY_THRESHOLD: usize = 1 << 14; + +fn gpu_bary_threshold() -> usize { + static CACHED: OnceLock = OnceLock::new(); + *CACHED.get_or_init(|| { + std::env::var("LAMBDA_VM_GPU_BARY_THRESHOLD") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(DEFAULT_GPU_BARY_THRESHOLD) + }) +} + +/// One ext3 scalar `(n_inv * g_n_inv) * (z^N - g^N)`. Returned as `[u64;3]`. +/// +/// The GPU kernels compute only the unscaled barycentric sum per column. +/// Applying this scalar on the host is one ext3 multiply per column, cheap +/// next to the trace-size reduction. +fn ood_ext3_scalar( + coset_offset_pow_n: &FieldElement, + n_inv: &FieldElement, + g_n_inv: &FieldElement, + z_pow_n: &FieldElement, +) -> [u64; 3] +where + F: IsField + IsSubFieldOf + 'static, + E: IsField + 'static, +{ + // (z^N - g^N) in E, via sub_subfield (E - F -> E). + let vanishing = z_pow_n.sub_subfield(coset_offset_pow_n); + let base_scalar = n_inv * g_n_inv; // F * F -> F + let scalar_ext3: FieldElement = &base_scalar * &vanishing; // F * E -> E + // SAFETY: TypeId-checked at the caller (`try_barycentric_*_on_handle`). + // E == Degree3GoldilocksExtensionField, whose backing is + // `[FieldElement; 3]`, layout-equivalent to `[u64; 3]`. + let ptr = &scalar_ext3 as *const FieldElement as *const u64; + unsafe { [*ptr, *ptr.add(1), *ptr.add(2)] } +} + +/// Multiply each raw GPU ext3 sum by the host-computed ext3 scalar. `sums_raw` +/// is `3 * num_cols` u64s (interleaved). Returns the final OOD evaluations as +/// `Vec>` of length `num_cols`. +fn apply_ext3_scalar(sums_raw: &[u64], scalar: [u64; 3], num_cols: usize) -> Vec> +where + E: IsField + 'static, +{ + type Gl = GoldilocksField; + type Ext3 = Degree3GoldilocksExtensionField; + + assert_eq!(sums_raw.len(), 3 * num_cols); + // Avoids the `E != Ext3` path reaching the unsafe `transmute_copy` below + // that is UB in that case. Cost is one TypeId comparison per call. + assert_eq!( + TypeId::of::(), + TypeId::of::(), + "apply_ext3_scalar: E must be Ext3" + ); + + let scalar_e: FieldElement = FieldElement::::new([ + FieldElement::::from_raw(scalar[0]), + FieldElement::::from_raw(scalar[1]), + FieldElement::::from_raw(scalar[2]), + ]); + + let mut out: Vec> = Vec::with_capacity(num_cols); + for c in 0..num_cols { + let s: FieldElement = FieldElement::::new([ + FieldElement::::from_raw(sums_raw[c * 3]), + FieldElement::::from_raw(sums_raw[c * 3 + 1]), + FieldElement::::from_raw(sums_raw[c * 3 + 2]), + ]); + let final_ext3 = &s * &scalar_e; + // SAFETY: TypeId-checked at the caller. E == Ext3, identical layout. + let final_e: FieldElement = + unsafe { transmute_copy::, FieldElement>(&final_ext3) }; + out.push(final_e); + } + out +} + +/// R2 GPU dispatch: batched ext3 LDE over `parts_coefs` (composition-poly +/// coefficient parts). Returns the LDE evaluations as `Vec>>` +/// of length `lde_size` per part on success, `None` to fall through to the CPU +/// path. Used by `round_2_compute_composition_polynomial` in the +/// `number_of_parts > 2` branch. +/// +/// Inputs are immutable (`&[&[FieldElement]]`) and outputs are fresh, so +/// there is no `restore_columns_on_err` needed. `Err` just returns `None` +/// and the caller's coefficient slices are left untouched. +pub(crate) fn try_evaluate_parts_on_lde_gpu( + parts_coefs: &[&[FieldElement]], + blowup_factor: usize, + domain_size: usize, + offset: &FieldElement, +) -> Option>>> +where + F: IsFFTField + IsField + IsSubFieldOf + 'static, + E: IsField + 'static, +{ + if parts_coefs.is_empty() { + return Some(Vec::new()); + } + if !domain_size.is_power_of_two() || !blowup_factor.is_power_of_two() { + return None; + } + let lde_size = domain_size.checked_mul(blowup_factor)?; + if lde_size < gpu_lde_threshold() { + return None; + } + if TypeId::of::() != TypeId::of::() { + return None; + } + if TypeId::of::() != TypeId::of::() { + return None; + } + let m = parts_coefs.len(); + + // Weights: `offset^k` for k in 0..domain_size. F is Goldilocks by check above. + let mut weights_u64 = Vec::with_capacity(domain_size); + let mut w = FieldElement::::one(); + for _ in 0..domain_size { + // SAFETY: F == Goldilocks per TypeId check. FieldElement is + // #[repr(transparent)] over u64. + let v: u64 = unsafe { *(w.value() as *const _ as *const u64) }; + weights_u64.push(v); + w *= offset; + } + + // Pack parts into per-part `3 * domain_size` u64 buffers (zero-padded). + let mut part_bufs: Vec> = Vec::with_capacity(m); + for part in parts_coefs.iter() { + let mut buf = vec![0u64; 3 * domain_size]; + let len = part.len().min(domain_size); + // SAFETY: E == Ext3; backing is `[FieldElement; 3]` = `[u64; 3]`. + let src_ptr = part.as_ptr() as *const u64; + let src_len = len.checked_mul(3).expect("part src len overflow"); + let src = unsafe { from_raw_parts(src_ptr, src_len) }; + buf[..src_len].copy_from_slice(src); + part_bufs.push(buf); + } + let input_slices: Vec<&[u64]> = part_bufs.iter().map(|v| v.as_slice()).collect(); + + let mut outputs: Vec>> = (0..m) + .map(|_| vec![FieldElement::::zero(); lde_size]) + .collect(); + let gpu_result = { + let mut out_slices: Vec<&mut [u64]> = outputs + .iter_mut() + .map(|o| { + let ptr = o.as_mut_ptr() as *mut u64; + let byte_len = lde_size.checked_mul(3).expect("ext3 out len overflow"); + // SAFETY: E == Ext3 per TypeId check; Vec> of + // length `lde_size` is layout-equivalent to `[u64; 3 * lde_size]`. + unsafe { from_raw_parts_mut(ptr, byte_len) } + }) + .collect(); + math_cuda::lde::evaluate_poly_coset_batch_ext3_into( + &input_slices, + domain_size, + blowup_factor, + &weights_u64, + &mut out_slices, + ) + }; + if gpu_result.is_err() { + // Outputs are local and dropped on return; caller's inputs are + // immutable, so no restore is needed. + return None; + } + GPU_PARTS_LDE_CALLS.fetch_add(1, Ordering::Relaxed); + Some(outputs) +} + +/// R2 GPU dispatch: build the composition-polynomial Merkle tree from the +/// host-side ext3 LDE eval Vecs produced by [`try_evaluate_parts_on_lde_gpu`] +/// (or the CPU path). Uses the same row-pair leaf pattern as the CPU +/// `commit_composition_polynomial`: each leaf hashes 2 consecutive +/// bit-reversed rows. +/// +/// Returns `None` to fall through to the CPU path when the type or size +/// conditions don't hold; returns `None` on a math-cuda `Err` so the caller +/// recomputes on CPU. +pub(crate) fn try_build_comp_poly_tree_gpu( + lde_parts: &[Vec>], +) -> Option> +where + E: IsField + 'static, + B: IsMerkleTreeBackend, +{ + if lde_parts.is_empty() { + return None; + } + let lde_size = lde_parts[0].len(); + if !lde_size.is_power_of_two() || lde_size < 2 { + return None; + } + if lde_size < gpu_lde_threshold() { + return None; + } + if TypeId::of::() != TypeId::of::() { + return None; + } + // All parts must have the same LDE length. + if lde_parts.iter().any(|p| p.len() != lde_size) { + return None; + } + + // SAFETY: E == Ext3 per TypeId check. FieldElement backing is + // `[FieldElement; 3]`, layout-equivalent to `[u64; 3]`. + let raw_parts: Vec<&[u64]> = lde_parts + .iter() + .map(|p| { + let byte_len = p.len().checked_mul(3).expect("ext3 part byte len overflow"); + unsafe { from_raw_parts(p.as_ptr() as *const u64, byte_len) } + }) + .collect(); + + let nodes_bytes = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3(&raw_parts) { + Ok(v) => v, + Err(_) => return None, + }; + + // lde_size is an even power of two >= 2, so 2*num_leaves == lde_size and + // tight_total_nodes = lde_size - 1 >= 1. No overflow or underflow possible. + let tight_total_nodes = lde_size - 1; + let expected_byte_len = tight_total_nodes + .checked_mul(32) + .expect("comp-poly node byte length overflow"); + debug_assert_eq!(nodes_bytes.len(), expected_byte_len); + + let nodes: Vec<[u8; 32]> = nodes_bytes + .chunks_exact(32) + .map(|c| { + c.try_into() + .expect("chunks_exact(32) yields exactly 32 bytes") + }) + .collect(); + GPU_COMP_POLY_TREE_CALLS.fetch_add(1, Ordering::Relaxed); + // Falls back to CPU on `None`, matching the R1 paths (lines 496, 557). + MerkleTree::::from_precomputed_nodes(nodes) +} + +/// R3 GPU dispatch: batched strided barycentric OOD evaluation over the main +/// (base-field) LDE columns kept on device from R1. Operates on the +/// device-resident LDE in place; only the coset points and inv_denoms are +/// copied to the device, not the columns. Returns the OOD evaluations +/// as `Vec>` of length `num_main_cols` (already scaled by +/// `vanishing * n_inv * g_n_inv`), or `None` if the GPU handle is absent, +/// types don't match, the trace-size domain is below threshold, or the +/// math-cuda call returns `Err`. +#[allow(clippy::too_many_arguments)] +pub(crate) fn try_barycentric_base_on_handle( + lde_trace: &LDETraceTable, + row_stride: usize, + coset_points: &[FieldElement], + coset_offset_pow_n: &FieldElement, + n_inv: &FieldElement, + g_n_inv: &FieldElement, + z_pow_n: &FieldElement, + inv_denoms: &[FieldElement], +) -> Option>> +where + F: IsField + IsSubFieldOf + 'static, + E: IsField + 'static, +{ + if TypeId::of::() != TypeId::of::() { + return None; + } + if TypeId::of::() != TypeId::of::() { + return None; + } + let main = lde_trace.gpu_main()?; + let num_cols = main.m; + if num_cols == 0 { + return Some(Vec::new()); + } + let n = coset_points.len(); + if !n.is_power_of_two() || n < gpu_bary_threshold() { + return None; + } + if inv_denoms.len() != n || main.lde_size != n.checked_mul(row_stride)? { + return None; + } + + // SAFETY: F == Goldilocks per TypeId check; FieldElement is + // #[repr(transparent)] over u64. + let points_raw: &[u64] = unsafe { from_raw_parts(coset_points.as_ptr() as *const u64, n) }; + // SAFETY: E == Ext3 per TypeId check; FieldElement backing is + // `[FieldElement; 3]` = `[u64; 3]`. + let inv_denoms_len = n.checked_mul(3).expect("inv_denoms u64 len overflow"); + let inv_denoms_raw: &[u64] = + unsafe { from_raw_parts(inv_denoms.as_ptr() as *const u64, inv_denoms_len) }; + + let sums_raw = match math_cuda::barycentric::barycentric_base_on_device( + main, + row_stride, + points_raw, + inv_denoms_raw, + n, + ) { + Ok(v) => v, + Err(_) => return None, + }; + GPU_BARY_CALLS.fetch_add(1, Ordering::Relaxed); + + let scalar = ood_ext3_scalar::(coset_offset_pow_n, n_inv, g_n_inv, z_pow_n); + Some(apply_ext3_scalar::(&sums_raw, scalar, num_cols)) +} + +/// Ext3 counterpart of [`try_barycentric_base_on_handle`] for the aux LDE. +/// Reads `lde_trace.gpu_aux()` (the de-interleaved 3-slab device buffer). +#[allow(clippy::too_many_arguments)] +pub(crate) fn try_barycentric_ext3_on_handle( + lde_trace: &LDETraceTable, + row_stride: usize, + coset_points: &[FieldElement], + coset_offset_pow_n: &FieldElement, + n_inv: &FieldElement, + g_n_inv: &FieldElement, + z_pow_n: &FieldElement, + inv_denoms: &[FieldElement], +) -> Option>> +where + F: IsField + IsSubFieldOf + 'static, + E: IsField + 'static, +{ + if TypeId::of::() != TypeId::of::() { + return None; + } + if TypeId::of::() != TypeId::of::() { + return None; + } + let aux = lde_trace.gpu_aux()?; + let num_cols = aux.m; + if num_cols == 0 { + return Some(Vec::new()); + } + let n = coset_points.len(); + if !n.is_power_of_two() || n < gpu_bary_threshold() { + return None; + } + if inv_denoms.len() != n || aux.lde_size != n.checked_mul(row_stride)? { + return None; + } + + let points_raw: &[u64] = unsafe { from_raw_parts(coset_points.as_ptr() as *const u64, n) }; + let inv_denoms_len = n.checked_mul(3).expect("inv_denoms u64 len overflow"); + let inv_denoms_raw: &[u64] = + unsafe { from_raw_parts(inv_denoms.as_ptr() as *const u64, inv_denoms_len) }; + + let sums_raw = match math_cuda::barycentric::barycentric_ext3_on_device( + aux, + row_stride, + points_raw, + inv_denoms_raw, + n, + ) { + Ok(v) => v, + Err(_) => return None, + }; + GPU_BARY_CALLS.fetch_add(1, Ordering::Relaxed); + + let scalar = ood_ext3_scalar::(coset_offset_pow_n, n_inv, g_n_inv, z_pow_n); + Some(apply_ext3_scalar::(&sums_raw, scalar, num_cols)) +} diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index dd26e020a..2c02eacf5 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -1020,28 +1020,66 @@ pub trait IsStarkProver< Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset) .unwrap(); let composition_poly_parts = composition_poly.break_in_parts(number_of_parts); - composition_poly_parts - .iter() - .map(|part| { - evaluate_polynomial_on_lde_domain( - part, - domain.blowup_factor, - domain.interpolation_domain_size, - &domain.coset_offset, - ) - .unwrap() - }) - .collect() + + let cpu_eval = || -> Vec>> { + composition_poly_parts + .iter() + .map(|part| { + evaluate_polynomial_on_lde_domain( + part, + domain.blowup_factor, + domain.interpolation_domain_size, + &domain.coset_offset, + ) + .unwrap() + }) + .collect() + }; + + // GPU fast path: batched ext3 LDE for all parts in one call. + // Non-`_keep` variant. The device buffer is freed at end of + // call; downstream R3 reads the host-side evals. PR-4 will + // upgrade to the `_keep` variant to retain the handle for R4 + // DEEP composition. + #[cfg(feature = "cuda")] + { + let parts_slices: Vec<&[FieldElement]> = composition_poly_parts + .iter() + .map(|p| p.coefficients.as_slice()) + .collect(); + crate::gpu_lde::try_evaluate_parts_on_lde_gpu::( + &parts_slices, + domain.blowup_factor, + domain.interpolation_domain_size, + &domain.coset_offset, + ) + .unwrap_or_else(cpu_eval) + } + #[cfg(not(feature = "cuda"))] + cpu_eval() }; #[cfg(feature = "instruments")] let fft_dur = t_sub.elapsed(); #[cfg(feature = "instruments")] let t_sub = Instant::now(); - let Some((composition_poly_merkle_tree, composition_poly_root)) = - Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) - else { - return Err(ProvingError::EmptyCommitment); + // GPU fast path for the comp-poly Merkle commit: row-pair Keccak + // leaves + device-side inner tree, both wrapping the host eval Vecs. + #[cfg(feature = "cuda")] + let gpu_tree = crate::gpu_lde::try_build_comp_poly_tree_gpu::< + FieldExtension, + BatchedMerkleTreeBackend, + >(&lde_composition_poly_parts_evaluations); + #[cfg(not(feature = "cuda"))] + let gpu_tree: Option> = None; + + let (composition_poly_merkle_tree, composition_poly_root) = match gpu_tree { + Some(tree) => { + let root = tree.root; + (tree, root) + } + None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) + .ok_or(ProvingError::EmptyCommitment)?, }; #[cfg(feature = "instruments")] let merkle_dur = t_sub.elapsed(); diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index c8e96987b..f4469447d 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -436,8 +436,8 @@ pub fn get_trace_evaluations_from_lde( dc: &DomainConstants, ) -> Table where - F: IsSubFieldOf + IsFFTField, - E: IsField, + F: IsSubFieldOf + IsFFTField + 'static, + E: IsField + 'static, { let n = domain.interpolation_domain_size; let bf = domain.blowup_factor; @@ -468,58 +468,113 @@ where let vanishing = z_pow_n.sub_subfield(&dc.offset_pow_n); let vanishing_factor = &n_inv_g_n_inv * &vanishing; - // Precompute inv_denoms = 1/(eval_point - coset_point_i) — shared across all columns + // Precompute inv_denoms = 1/(eval_point - coset_point_i), shared across all columns. + // Stays on CPU: the batch-invert cost at this scale (n * num_eval_points) is already + // rayon-parallelised across tables, and a GPU port regressed wall time in a + // 2x15-trial A/B due to stream contention from many concurrent launches. let inv_denoms = barycentric_inv_denoms(eval_point, &dc.points); - // Precompute col_scale[i] = point[i] * inv_denom[i] — shared across ALL columns. - // This eliminates N redundant F×E multiplies per column. - let col_scale: Vec> = dc - .points - .iter() - .zip(inv_denoms.iter()) - .map(|(point, inv_d)| point * inv_d) - .collect(); - - // Evaluate all main columns directly from LDE (no extraction copy). - // For main columns (base field F): sum = Σ col_scale[i] * lde_col[i*bf] - // lde_col[i*bf] is F, col_scale[i] is E; use F×E → E mixed arithmetic. - #[cfg(feature = "parallel")] - let main_iter = (0..num_main_cols).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let main_iter = 0..num_main_cols; - let main_evals: Vec> = main_iter - .map(|col_idx| { - let lde_col = &lde_trace.main_columns[col_idx]; - let sum = col_scale + // col_scale[i] = point[i] * inv_denom[i], shared across ALL CPU column + // loops below. Computed lazily on first CPU-fallback use so the all-GPU + // path pays nothing, while the all-CPU and mixed paths only pay once. + let mut col_scale: Option>> = None; + + // GPU fast path: batched strided barycentric over the main-trace LDE + // already on device. Returns `None` when the GPU R1 path didn't run + // for this table (handle absent), the size is below threshold, types + // don't match, or the math-cuda call errored. Caller falls through + // to the existing rayon CPU loop. + #[cfg(feature = "cuda")] + let main_gpu = crate::gpu_lde::try_barycentric_base_on_handle::( + lde_trace, + bf, + &dc.points, + &dc.offset_pow_n, + &dc.size_inv, + &dc.offset_pow_n_inv, + &z_pow_n, + &inv_denoms, + ); + #[cfg(not(feature = "cuda"))] + let main_gpu: Option>> = None; + + let main_evals: Vec> = if let Some(v) = main_gpu { + v + } else { + let col_scale = col_scale.get_or_insert_with(|| { + dc.points .iter() - .enumerate() - .fold(FieldElement::::zero(), |acc, (i, scale)| { - acc + &lde_col[i * bf] * scale - }); - &vanishing_factor * &sum - }) - .collect(); + .zip(inv_denoms.iter()) + .map(|(point, inv_d)| point * inv_d) + .collect() + }); + // Evaluate all main columns directly from LDE (no extraction copy). + // For main columns (base field F): sum = sum over i of col_scale[i] * lde_col[i*bf]. + // lde_col[i*bf] is F, col_scale[i] is E; use F*E -> E mixed arithmetic. + #[cfg(feature = "parallel")] + let main_iter = (0..num_main_cols).into_par_iter(); + #[cfg(not(feature = "parallel"))] + let main_iter = 0..num_main_cols; + main_iter + .map(|col_idx| { + let lde_col = &lde_trace.main_columns[col_idx]; + let sum = col_scale + .iter() + .enumerate() + .fold(FieldElement::::zero(), |acc, (i, scale)| { + acc + &lde_col[i * bf] * scale + }); + &vanishing_factor * &sum + }) + .collect() + }; table_data.extend(main_evals); - // Evaluate all aux columns directly from LDE (no extraction copy). - // For aux columns (extension field E): sum = Σ col_scale[i] * lde_col[i*bf] - // Both col_scale and lde_col are in E, so each multiply is E×E → E. - #[cfg(feature = "parallel")] - let aux_iter = (0..num_aux_cols).into_par_iter(); - #[cfg(not(feature = "parallel"))] - let aux_iter = 0..num_aux_cols; - let aux_evals: Vec> = aux_iter - .map(|col_idx| { - let lde_col = &lde_trace.aux_columns[col_idx]; - let sum = col_scale + // GPU fast path for aux columns reading the de-interleaved ext3 LDE handle. + #[cfg(feature = "cuda")] + let aux_gpu = crate::gpu_lde::try_barycentric_ext3_on_handle::( + lde_trace, + bf, + &dc.points, + &dc.offset_pow_n, + &dc.size_inv, + &dc.offset_pow_n_inv, + &z_pow_n, + &inv_denoms, + ); + #[cfg(not(feature = "cuda"))] + let aux_gpu: Option>> = None; + + let aux_evals: Vec> = if let Some(v) = aux_gpu { + v + } else { + let col_scale = col_scale.get_or_insert_with(|| { + dc.points .iter() - .enumerate() - .fold(FieldElement::::zero(), |acc, (i, scale)| { - acc + scale * &lde_col[i * bf] - }); - &vanishing_factor * &sum - }) - .collect(); + .zip(inv_denoms.iter()) + .map(|(point, inv_d)| point * inv_d) + .collect() + }); + // Evaluate all aux columns directly from LDE (no extraction copy). + // For aux columns (extension field E): sum = sum over i of col_scale[i] * lde_col[i*bf]. + // Both col_scale and lde_col are in E, so each multiply is E*E -> E. + #[cfg(feature = "parallel")] + let aux_iter = (0..num_aux_cols).into_par_iter(); + #[cfg(not(feature = "parallel"))] + let aux_iter = 0..num_aux_cols; + aux_iter + .map(|col_idx| { + let lde_col = &lde_trace.aux_columns[col_idx]; + let sum = col_scale + .iter() + .enumerate() + .fold(FieldElement::::zero(), |acc, (i, scale)| { + acc + scale * &lde_col[i * bf] + }); + &vanishing_factor * &sum + }) + .collect() + }; table_data.extend(aux_evals); } diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs new file mode 100644 index 000000000..e54becb85 --- /dev/null +++ b/prover/tests/cuda_path_integration.rs @@ -0,0 +1,49 @@ +//! End-to-end GPU path coverage. Proves a real ELF with the `cuda` feature on +//! and asserts every dispatch counter introduced by the cuda backend actually +//! fired. Catches silent CPU-fallback regressions where the prover would still +//! emit a valid proof but the GPU path got skipped (size below threshold, type +//! mismatch, handle plumbing broken, etc.). +//! +//! `#[ignore]`'d so the no-GPU CI path skips it. Run via `make test-cuda-integration` +//! or `cargo test -p lambda-vm-prover --release --features cuda --test cuda_path_integration -- --ignored --nocapture`. +#![cfg(feature = "cuda")] + +use lambda_vm_prover::prove; +use lambda_vm_prover::test_utils::asm_elf_bytes; +use stark::gpu_lde::{ + gpu_bary_calls, gpu_comp_poly_tree_calls, gpu_lde_calls, gpu_parts_lde_calls, + reset_all_gpu_call_counters, +}; + +#[test] +#[ignore = "requires GPU; run with --ignored --nocapture"] +fn gpu_path_fires_end_to_end() { + // Warm-up amortises PTX load + pool warm-up + first-call pinned alloc + // so the profiled-pass counter assertions reflect only steady-state work. + let elf = asm_elf_bytes("fib_iterative_1M"); + let _ = prove(&elf).expect("warm-up prove"); + + reset_all_gpu_call_counters(); + + let _ = prove(&elf).expect("prove"); + + // R1 main + aux fused LDE+Merkle. Fires for every table above the LDE + // threshold; fib_iterative_1M has plenty. + assert!(gpu_lde_calls() > 0, "R1 GPU LDE path did not fire"); + + // R3 OOD barycentric reads the R1 main + aux device handles. Fires once + // per (eval-point, base/ext3) pair for every table that took the R1 GPU + // path. + assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire"); + + // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's + // `number_of_parts > 2`. The branch and shift tables have degree-3 + // transition constraints, so this triggers on any non-trivial prove. + assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire"); + + // R2 comp-poly Merkle tree build, paired with the parts LDE above. + assert!( + gpu_comp_poly_tree_calls() > 0, + "R2 GPU comp-poly tree did not fire" + ); +}