diff --git a/bin/cli/Cargo.toml b/bin/cli/Cargo.toml index 87bb1c8fc..f91175037 100644 --- a/bin/cli/Cargo.toml +++ b/bin/cli/Cargo.toml @@ -18,3 +18,4 @@ env_logger = "0.11" jemalloc-stats = ["dep:tikv-jemalloc-ctl"] disk-spill = ["prover/disk-spill"] instruments = ["prover/instruments", "stark/instruments"] +phased-fft = ["prover/phased-fft"] diff --git a/crypto/math/Cargo.toml b/crypto/math/Cargo.toml index 85979a7c4..93ad93a2f 100644 --- a/crypto/math/Cargo.toml +++ b/crypto/math/Cargo.toml @@ -34,6 +34,10 @@ default = ["parallel", "std"] std = ["alloc", "serde?/std", "serde_json?/std"] alloc = [] parallel = ["dep:rayon"] +# Route the forward FFT through the phased (Bailey four-step) path when the +# LDE size is large enough. Output convention unchanged — natural order. +# Off by default so A/B bench can toggle. +phased-fft = ["parallel"] lambdaworks-serde-binary = ["dep:serde", "alloc"] lambdaworks-serde-string = ["dep:serde", "dep:serde_json", "alloc"] proptest = ["dep:proptest"] @@ -45,4 +49,8 @@ getrandom = { version = "0.2.15", features = ["js"] } [[bench]] name = "goldilocks_benchmark" +harness = false + +[[bench]] +name = "phased_fft_benchmark" harness = false \ No newline at end of file diff --git a/crypto/math/benches/phased_fft_benchmark.rs b/crypto/math/benches/phased_fft_benchmark.rs new file mode 100644 index 000000000..4a3644a58 --- /dev/null +++ b/crypto/math/benches/phased_fft_benchmark.rs @@ -0,0 +1,141 @@ +//! Phased FFT vs Bowers FFT — single-column and multi-column. + +use std::hint::black_box; +use std::time::Duration; + +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use rand::{RngCore, SeedableRng}; + +use math::fft::bit_reversing::in_place_bit_reverse_permute; +use math::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused_parallel}; +use math::fft::phased_fft::{ + PhasedFftContext, bowers_phased_fft_multicol, bowers_phased_fft_with_buf, +}; +use math::field::element::FieldElement; +use math::field::goldilocks::GoldilocksField; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +type F = GoldilocksField; +type FE = FieldElement; + +fn random_input(log_n: usize, seed: u64) -> Vec { + let n = 1usize << log_n; + let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(seed); + (0..n).map(|_| FE::from(rng.next_u64())).collect() +} + +fn bench_single_col(c: &mut Criterion) { + let mut group = c.benchmark_group("fft_single_col"); + group.sample_size(10); + group.measurement_time(Duration::from_secs(8)); + + for &log_n in &[20usize, 22, 24] { + let n = 1usize << log_n; + let input = random_input(log_n, 1); + let bowers_twiddles = LayerTwiddles::::new(log_n as u64).unwrap(); + let phased_ctx = PhasedFftContext::::new(log_n).unwrap(); + + group.bench_with_input( + BenchmarkId::new("bowers", format!("2^{log_n}")), + &n, + |b, _| { + b.iter_batched( + || input.clone(), + |mut data| { + bowers_fft_opt_fused_parallel(&mut data, &bowers_twiddles).unwrap(); + in_place_bit_reverse_permute(&mut data); + black_box(data); + }, + criterion::BatchSize::LargeInput, + ); + }, + ); + + let mut scratch: Vec = Vec::with_capacity(n); + group.bench_with_input( + BenchmarkId::new("phased", format!("2^{log_n}")), + &n, + |b, _| { + b.iter_batched( + || input.clone(), + |mut data| { + bowers_phased_fft_with_buf::(&mut data, &phased_ctx, &mut scratch) + .unwrap(); + black_box(data); + }, + criterion::BatchSize::LargeInput, + ); + }, + ); + } + + group.finish(); +} + +/// Multi-column bench: realistic trace LDE shape. CPU table is 74 cols at +/// log_n = 21, MEMW at 49 cols. We probe a few (cols × log_n) combinations. +fn bench_multi_col(c: &mut Criterion) { + let mut group = c.benchmark_group("fft_multi_col"); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + for &(cols, log_n) in &[(74usize, 21usize), (49, 21), (24, 22), (12, 24)] { + let n = 1usize << log_n; + let bowers_twiddles = LayerTwiddles::::new(log_n as u64).unwrap(); + let phased_ctx = PhasedFftContext::::new(log_n).unwrap(); + let columns: Vec> = (0..cols) + .map(|c| random_input(log_n, c as u64 + 1)) + .collect(); + + // Baseline 1: rayon par_iter over columns, each calls bowers + // independently. This mirrors what the prover does today. + group.bench_with_input( + BenchmarkId::new("bowers_par", format!("{cols}cols_2^{log_n}")), + &n, + |b, _| { + b.iter_batched( + || columns.clone(), + |mut data| { + #[cfg(feature = "parallel")] + data.par_iter_mut().for_each(|col| { + bowers_fft_opt_fused_parallel(col, &bowers_twiddles).unwrap(); + in_place_bit_reverse_permute(col); + }); + #[cfg(not(feature = "parallel"))] + for col in data.iter_mut() { + bowers_fft_opt_fused_parallel(col, &bowers_twiddles).unwrap(); + in_place_bit_reverse_permute(col); + } + black_box(data); + }, + criterion::BatchSize::LargeInput, + ); + }, + ); + + // Phased multi-col: shared ctx + per-worker buffer pool. + group.bench_with_input( + BenchmarkId::new("phased_multicol", format!("{cols}cols_2^{log_n}")), + &n, + |b, _| { + b.iter_batched( + || columns.clone(), + |mut data| { + let mut refs: Vec<&mut [FE]> = + data.iter_mut().map(|c| c.as_mut_slice()).collect(); + bowers_phased_fft_multicol::(&mut refs, &phased_ctx).unwrap(); + black_box(data); + }, + criterion::BatchSize::LargeInput, + ); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_single_col, bench_multi_col); +criterion_main!(benches); diff --git a/crypto/math/src/fft/mod.rs b/crypto/math/src/fft/mod.rs index fd0d1c4e2..127a5a4cd 100644 --- a/crypto/math/src/fft/mod.rs +++ b/crypto/math/src/fft/mod.rs @@ -3,6 +3,8 @@ pub mod bit_reversing; pub mod bowers_fft; pub mod errors; #[cfg(feature = "alloc")] +pub mod phased_fft; +#[cfg(feature = "alloc")] pub mod roots_of_unity; #[cfg(all(test, feature = "alloc"))] diff --git a/crypto/math/src/fft/phased_fft.rs b/crypto/math/src/fft/phased_fft.rs new file mode 100644 index 000000000..2a5b0d0f6 --- /dev/null +++ b/crypto/math/src/fft/phased_fft.rs @@ -0,0 +1,607 @@ +//! Phased Bowers FFT — 2-phase Bailey four-step prototype. +//! +//! This is a probe to evaluate whether a phased / cache-blocked NTT +//! beats the current single-pass `bowers_fft_opt_fused_parallel` for +//! large `N` (especially `log_n >= 22` where the working set spills out +//! of L3 and the naive layer-by-layer Bowers pattern becomes +//! memory-bound). +//! +//! # Approach +//! +//! The standard Bailey four-step. View the input as an `M × K` row-major +//! matrix with `N = M·K` and split log_n roughly in half: +//! +//! 1. Transpose input → `K × M`. +//! 2. FFT_M on each of the K rows (reuses existing Bowers fused +//! kernels; each row fits in L1). +//! 3. Multiply pointwise by `ω_N^(s·d)` where `s` is the row index +//! `[0, K)` and `d` the column index `[0, M)`. +//! 4. Transpose → `M × K`. +//! 5. FFT_K on each of the M rows. +//! 6. Final transpose → linear output `Y[0], …, Y[N-1]`. +//! +//! Phase twiddles are computed on-the-fly (no upfront `N`-sized table) +//! — for `log_n = 26` the inter-phase twiddle table would cost ~1 GB +//! otherwise. We do `K` powers `ω_K^s` and step the accumulator +//! inside the row, one mul per element. +//! +//! Output convention: **natural order**. Equivalent to +//! `bowers_fft_opt_fused_parallel(x, ...); in_place_bit_reverse_permute(x)`. +//! +//! # Status +//! +//! Prototype. Correctness verified by equivalence tests against the +//! existing Bowers path; performance measured by `phased_fft_bench`. +//! Not yet wired into the prover / polynomial API. + +use crate::fft::bit_reversing::in_place_bit_reverse_permute; +use crate::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused}; +use crate::fft::errors::FFTError; +use crate::field::{ + element::FieldElement, + traits::{IsFFTField, IsField, IsSubFieldOf}, +}; +use alloc::vec::Vec; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; + +/// Tile size for the out-of-place transpose. 32 × 32 × 8 B = 8 KB per +/// tile fits the L1 of Goldilocks base-field elements (extension-field +/// tiles are ~3× larger but still fit). +const TRANSPOSE_TILE: usize = 32; + +/// Out-of-place tiled transpose, parallelised over bands of output rows. +/// +/// Same contract as [`tiled_transpose`] but splits `output` into +/// `TRANSPOSE_TILE`-row bands processed in parallel. Each band reads a +/// `TRANSPOSE_TILE`-wide column strip of `input` (tiled along `rows` for +/// cache locality) and writes a contiguous chunk of `output`. Use for +/// the standalone single-FFT path where no outer parallel loop exists. +#[cfg(feature = "parallel")] +pub fn tiled_transpose_par( + input: &[T], + output: &mut [T], + rows: usize, + cols: usize, +) { + debug_assert_eq!(input.len(), rows * cols); + debug_assert_eq!(output.len(), rows * cols); + + output + .par_chunks_mut(rows * TRANSPOSE_TILE) + .enumerate() + .for_each(|(band, out_band)| { + let c0 = band * TRANSPOSE_TILE; + let c_end = (c0 + TRANSPOSE_TILE).min(cols); + for tile_r in (0..rows).step_by(TRANSPOSE_TILE) { + let r_end = (tile_r + TRANSPOSE_TILE).min(rows); + for c in c0..c_end { + let out_row = &mut out_band[(c - c0) * rows..(c - c0) * rows + rows]; + for r in tile_r..r_end { + out_row[r] = input[r * cols + c].clone(); + } + } + } + }); +} + +/// Dispatch to the parallel transpose when the feature is on, else the +/// sequential one. Used by the standalone single-FFT path. +#[inline] +fn transpose_single( + input: &[T], + output: &mut [T], + rows: usize, + cols: usize, +) { + #[cfg(feature = "parallel")] + tiled_transpose_par(input, output, rows, cols); + #[cfg(not(feature = "parallel"))] + tiled_transpose(input, output, rows, cols); +} + +/// Out-of-place tiled transpose (sequential). +/// +/// `input` is `rows × cols` row-major; `output` becomes `cols × rows` +/// row-major. Tile-based for cache locality: each tile pair is +/// `TRANSPOSE_TILE × TRANSPOSE_TILE` elements, sized to fit L1. +/// +/// Use this when the caller is already parallelising at a coarser grain +/// (e.g. across columns in [`bowers_phased_fft_multicol`]); nesting a +/// parallel transpose under a parallel column loop oversubscribes the +/// pool. +pub fn tiled_transpose(input: &[T], output: &mut [T], rows: usize, cols: usize) { + debug_assert_eq!(input.len(), rows * cols); + debug_assert_eq!(output.len(), rows * cols); + + for tile_r in (0..rows).step_by(TRANSPOSE_TILE) { + let r_end = (tile_r + TRANSPOSE_TILE).min(rows); + for tile_c in (0..cols).step_by(TRANSPOSE_TILE) { + let c_end = (tile_c + TRANSPOSE_TILE).min(cols); + for r in tile_r..r_end { + for c in tile_c..c_end { + output[c * rows + r] = input[r * cols + c].clone(); + } + } + } + } +} + +/// Pre-built phased-FFT twiddle context. Build once per `(log_n)` pair, +/// reuse across many invocations to amortise the LayerTwiddles +/// allocation + the `ω_N` lookup. +pub struct PhasedFftContext { + pub log_n: usize, + pub log_m: usize, + pub log_k: usize, + pub twiddles_m: LayerTwiddles, + /// `None` when `log_k == log_m` — caller reuses `twiddles_m`. + pub twiddles_k: Option>, + pub omega_n: FieldElement, +} + +impl PhasedFftContext { + pub fn new(log_n: usize) -> Result { + let n = 1usize << log_n; + if log_n < 4 { + return Err(FFTError::InputError(n)); + } + let log_m = log_n.div_ceil(2); + let log_k = log_n - log_m; + let twiddles_m = LayerTwiddles::::new(log_m as u64) + .ok_or(FFTError::InputError(1usize << log_m))?; + let twiddles_k = if log_k == log_m { + None + } else { + Some( + LayerTwiddles::::new(log_k as u64) + .ok_or(FFTError::InputError(1usize << log_k))?, + ) + }; + let omega_n = F::get_primitive_root_of_unity(log_n as u64) + .map_err(|_| FFTError::InputError(n))?; + Ok(Self { + log_n, + log_m, + log_k, + twiddles_m, + twiddles_k, + omega_n, + }) + } + + #[inline] + fn twiddles_k_ref(&self) -> &LayerTwiddles { + self.twiddles_k.as_ref().unwrap_or(&self.twiddles_m) + } +} + +/// 2-phase Bailey four-step FFT. +/// +/// Splits the `N = 2^log_n` problem into two cache-friendly FFTs of +/// sizes `M = 2^⌈log_n/2⌉` and `K = 2^⌊log_n/2⌋`, with a pointwise +/// phase-twiddle multiply between them. +/// +/// Output: natural order (equivalent to +/// `bowers_fft_opt_fused_parallel` followed by +/// `in_place_bit_reverse_permute`). +/// +/// For `log_n < 4` falls back to a single-pass Bowers (the phased +/// machinery has fixed overhead that doesn't pay back at tiny sizes). +/// +/// Convenience wrapper that builds the `PhasedFftContext` per call. +/// Hot-path callers should prefer [`bowers_phased_fft_with_context`]. +/// +/// # Errors +/// Returns `FFTError::InputError` if `input.len()` is not a power of +/// two, or if the field lacks a primitive root of unity of the +/// required order. +pub fn bowers_phased_fft(input: &mut [FieldElement]) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + let n = input.len(); + if !n.is_power_of_two() { + return Err(FFTError::InputError(n)); + } + if n <= 1 { + return Ok(()); + } + let log_n = n.trailing_zeros() as usize; + + if log_n < 4 { + let twiddles = LayerTwiddles::::new(log_n as u64).ok_or(FFTError::InputError(n))?; + bowers_fft_opt_fused(input, &twiddles)?; + in_place_bit_reverse_permute(input); + return Ok(()); + } + + let ctx = PhasedFftContext::::new(log_n)?; + bowers_phased_fft_with_context(input, &ctx) +} + +/// 2-phase Bailey four-step FFT using a pre-built twiddle context. +/// +/// Skips the inner-twiddle and `ω_N` setup that +/// [`bowers_phased_fft`] pays per call. Caller must size `input` to +/// `1 << ctx.log_n`. Allocates an internal `N`-element scratch buffer +/// per call — for repeated calls (multi-column / multi-poly), use +/// [`bowers_phased_fft_with_buf`] instead. +pub fn bowers_phased_fft_with_context( + input: &mut [FieldElement], + ctx: &PhasedFftContext, +) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + let mut buf: Vec> = Vec::new(); + bowers_phased_fft_with_buf(input, ctx, &mut buf) +} + +/// 2-phase Bailey four-step FFT, reusing a caller-supplied scratch +/// buffer. The buffer is resized to `1 << ctx.log_n` on demand; +/// repeat callers can hold a long-lived `Vec` and pay only the +/// initial allocation. +pub fn bowers_phased_fft_with_buf( + input: &mut [FieldElement], + ctx: &PhasedFftContext, + buf: &mut Vec>, +) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + // Standalone single-FFT path: parallelise *within* this FFT. + phased_fft_core(input, ctx, buf, true) +} + +/// Sequential-inner variant of [`bowers_phased_fft_with_buf`]. +/// +/// Runs the transposes, inner row-FFTs and phase-twiddle pass on a +/// single thread. Use when the caller already parallelises at a coarser +/// grain — e.g. the prover's per-column `par_iter` LDE — so nesting a +/// second rayon level here would oversubscribe the pool. The Bailey +/// cache-blocking (reduced DRAM traffic) still applies; only intra-FFT +/// parallelism is dropped. +pub fn bowers_phased_fft_seq_with_buf( + input: &mut [FieldElement], + ctx: &PhasedFftContext, + buf: &mut Vec>, +) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + phased_fft_core(input, ctx, buf, false) +} + +/// Core Bailey four-step worker shared by the single-FFT and +/// multi-column entry points. +/// +/// `parallel` selects the intra-FFT parallelism strategy: +/// - `true` — parallelise the transposes, per-row inner FFTs and the +/// phase-twiddle pass (the only work in flight). +/// - `false` — fully sequential, for callers already saturating the +/// pool at a coarser grain (one column per worker in +/// [`bowers_phased_fft_multicol`]); avoids nested-rayon +/// oversubscription. +fn phased_fft_core( + input: &mut [FieldElement], + ctx: &PhasedFftContext, + buf: &mut Vec>, + parallel: bool, +) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + let n = 1usize << ctx.log_n; + if input.len() != n { + return Err(FFTError::InputError(input.len())); + } + let m = 1usize << ctx.log_m; + let k = 1usize << ctx.log_k; + let twiddles_m = &ctx.twiddles_m; + let twiddles_k_ref = ctx.twiddles_k_ref(); + let omega_n = &ctx.omega_n; + + if buf.len() < n { + buf.resize(n, FieldElement::::zero()); + } + let buf: &mut [FieldElement] = &mut buf.as_mut_slice()[..n]; + + let transpose = |src: &[FieldElement], dst: &mut [FieldElement], r: usize, c: usize| { + if parallel { + transpose_single(src, dst, r, c); + } else { + tiled_transpose(src, dst, r, c); + } + }; + let run_rows = |slice: &mut [FieldElement], + row_len: usize, + tw: &LayerTwiddles| + -> Result<(), FFTError> { + #[cfg(feature = "parallel")] + if parallel { + return slice + .par_chunks_mut(row_len) + .try_for_each(|row| -> Result<(), FFTError> { + bowers_fft_opt_fused(row, tw)?; + in_place_bit_reverse_permute(row); + Ok(()) + }); + } + for row in slice.chunks_exact_mut(row_len) { + bowers_fft_opt_fused(row, tw)?; + in_place_bit_reverse_permute(row); + } + Ok(()) + }; + + // Step 1: transpose input (M × K) → buf (K × M). + transpose(input, buf, m, k); + // Step 2: FFT_M on each of K rows in buf. + run_rows(buf, m, twiddles_m)?; + // Step 3: phase twiddle multiply on buf (K × M). + apply_phase_twiddles::(buf, k, m, omega_n, parallel); + // Step 4: transpose buf (K × M) → input (M × K). + transpose(buf, input, k, m); + // Step 5: FFT_K on each of M rows in input. + run_rows(input, k, twiddles_k_ref)?; + // Step 6: final transpose to natural linear order. Lands in buf, so + // copy back (parallel copy to stay off the serial critical path). + transpose(input, buf, m, k); + copy_into(input, buf, parallel); + + Ok(()) +} + +/// `dst.clone_from_slice(src)`, parallelised when requested. +#[inline] +fn copy_into(dst: &mut [FieldElement], src: &[FieldElement], parallel: bool) +where + E: IsField, + FieldElement: Send + Sync + Clone, +{ + #[cfg(feature = "parallel")] + if parallel { + dst.par_chunks_mut(1 << 16) + .zip(src.par_chunks(1 << 16)) + .for_each(|(d, s)| d.clone_from_slice(s)); + return; + } + dst.clone_from_slice(src); +} + +/// Apply the inter-phase twiddle correction in-place. +/// +/// `buf` is `rows × cols` row-major; `omega_n` is the primitive `N`-th +/// root of unity where `N = rows · cols`. Multiplies `buf[s·cols + d]` +/// by `ω_N^(s·d)` for `s ∈ [0, rows)`, `d ∈ [0, cols)`. +/// +/// Rows are independent; parallelises across `s` when `parallel`. +fn apply_phase_twiddles( + buf: &mut [FieldElement], + rows: usize, + cols: usize, + omega_n: &FieldElement, + parallel: bool, +) where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync, +{ + debug_assert_eq!(buf.len(), rows * cols); + + let apply_row = |(s, row): (usize, &mut [FieldElement])| { + if s == 0 { + return; + } + let omega_s = omega_n.pow(s as u64); + let mut accum = FieldElement::::one(); + for slot in row.iter_mut() { + *slot = &accum * &*slot; + accum = &accum * &omega_s; + } + }; + + #[cfg(feature = "parallel")] + if parallel { + buf.par_chunks_mut(cols).enumerate().for_each(apply_row); + return; + } + buf.chunks_exact_mut(cols).enumerate().for_each(apply_row); +} + +/// Multi-column phased FFT: runs N-point FFTs over many columns in +/// parallel, sharing the PhasedFftContext and amortising the scratch +/// buffer across rayon workers via for_each_init. +/// +/// Each column receives the same FFT (output in natural order). Use +/// when you have a batch of independent polynomials to evaluate at the +/// same LDE domain — typical for STARK trace LDE. +/// +/// # Memory model +/// +/// One scratch buffer of N field elements per active worker thread +/// (= rayon::current_num_threads() typically). Independent of column +/// count: 100 columns processed by 8 workers ⇒ 8 buffers, not 100. +pub fn bowers_phased_fft_multicol( + columns: &mut [&mut [FieldElement]], + ctx: &PhasedFftContext, +) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + let n = 1usize << ctx.log_n; + + #[cfg(feature = "parallel")] + { + columns + .par_iter_mut() + .try_for_each_init( + || Vec::>::with_capacity(n), + |buf, col| -> Result<(), FFTError> { + // Sequential inner: this loop already parallelises + // across columns; nesting would oversubscribe rayon. + phased_fft_core(col, ctx, buf, false) + }, + )?; + } + #[cfg(not(feature = "parallel"))] + { + let mut buf: Vec> = Vec::with_capacity(n); + for col in columns.iter_mut() { + bowers_phased_fft_with_buf(col, ctx, &mut buf)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::field::goldilocks::GoldilocksField; + + type F = GoldilocksField; + type FE = FieldElement; + + /// Reference: existing Bowers + bit_reverse → natural order output. + fn reference_fft(input: &[FE]) -> Vec { + let log_n = input.len().trailing_zeros() as u64; + let twiddles = LayerTwiddles::::new(log_n).unwrap(); + let mut buf = input.to_vec(); + bowers_fft_opt_fused(&mut buf, &twiddles).unwrap(); + in_place_bit_reverse_permute(&mut buf); + buf + } + + fn random_input(log_n: usize, seed: u64) -> Vec { + // Deterministic pseudo-random over Goldilocks: chain LCG on u64. + let n = 1usize << log_n; + let mut state = seed.wrapping_mul(0x9E37_79B9_7F4A_7C15).wrapping_add(1); + (0..n) + .map(|_| { + state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + FE::from(state) + }) + .collect() + } + + #[test] + fn phased_equivalence_log4() { + let input = random_input(4, 42); + let expected = reference_fft(&input); + let mut actual = input; + bowers_phased_fft::(&mut actual).unwrap(); + assert_eq!(actual, expected, "log_n = 4 mismatch"); + } + + #[test] + fn phased_equivalence_log5_odd_split() { + // log_n = 5 → log_m = 3, log_k = 2 (uneven split). + let input = random_input(5, 99); + let expected = reference_fft(&input); + let mut actual = input; + bowers_phased_fft::(&mut actual).unwrap(); + assert_eq!(actual, expected, "log_n = 5 mismatch"); + } + + #[test] + fn phased_equivalence_log_n_4_through_12() { + for log_n in 4..=12 { + for seed in [1u64, 7, 12345, 0xdead_beef] { + let input = random_input(log_n, seed); + let expected = reference_fft(&input); + let mut actual = input; + bowers_phased_fft::(&mut actual).unwrap(); + assert_eq!( + actual, expected, + "log_n = {log_n}, seed = {seed:x}: phased output mismatches reference" + ); + } + } + } + + #[test] + fn phased_equivalence_log_n_16() { + let input = random_input(16, 0xcafe_babe); + let expected = reference_fft(&input); + let mut actual = input; + bowers_phased_fft::(&mut actual).unwrap(); + assert_eq!(actual, expected, "log_n = 16 mismatch"); + } + + #[test] + fn phased_equivalence_log_n_18_odd_split() { + // log_n = 18 → log_m = 9, log_k = 9 (even split, no reuse). + let input = random_input(18, 7); + let expected = reference_fft(&input); + let mut actual = input; + bowers_phased_fft::(&mut actual).unwrap(); + assert_eq!(actual, expected, "log_n = 18 mismatch"); + } + + #[test] + fn phased_multicol_matches_per_column_reference() { + let log_n = 14; + let num_cols = 6; + let ctx = PhasedFftContext::::new(log_n).expect("ctx valid"); + + let mut columns: Vec> = + (0..num_cols).map(|c| random_input(log_n, c as u64 + 1)).collect(); + let expected: Vec> = columns.iter().map(|c| reference_fft(c)).collect(); + + let mut col_refs: Vec<&mut [FE]> = + columns.iter_mut().map(|v| v.as_mut_slice()).collect(); + bowers_phased_fft_multicol::(&mut col_refs, &ctx).expect("multicol ok"); + + for (c, (got, want)) in columns.iter().zip(expected.iter()).enumerate() { + assert_eq!(got, want, "column {c} mismatches reference"); + } + } + + #[test] + fn tiled_transpose_2x3() { + let input: Vec = vec![1, 2, 3, 4, 5, 6]; + let mut output = vec![0u64; 6]; + tiled_transpose(&input, &mut output, 2, 3); + // Input (2×3): + // 1 2 3 + // 4 5 6 + // Expected output (3×2): + // 1 4 + // 2 5 + // 3 6 + assert_eq!(output, vec![1, 4, 2, 5, 3, 6]); + } + + #[test] + fn tiled_transpose_roundtrip() { + let rows = 17; + let cols = 23; + let input: Vec = (0..(rows * cols) as u64).collect(); + let mut t = vec![0u64; rows * cols]; + let mut back = vec![0u64; rows * cols]; + tiled_transpose(&input, &mut t, rows, cols); + tiled_transpose(&t, &mut back, cols, rows); + assert_eq!(back, input); + } +} diff --git a/crypto/math/src/polynomial.rs b/crypto/math/src/polynomial.rs index e3eaf66d4..b7bcc4d63 100644 --- a/crypto/math/src/polynomial.rs +++ b/crypto/math/src/polynomial.rs @@ -4,6 +4,10 @@ use crate::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused, bowers_ifft_op #[cfg(feature = "parallel")] use crate::fft::bowers_fft::{bowers_fft_opt_fused_parallel, bowers_ifft_opt_parallel}; use crate::fft::errors::FFTError; +// Phased FFT is unconditionally wired on this research branch so the bench +// harness exercises it without needing a cargo feature flag. The +// `phased-fft` feature is retained as a no-op for compatibility. +use crate::fft::phased_fft::{PhasedFftContext, bowers_phased_fft_seq_with_buf}; use crate::field::traits::{IsFFTField, IsField, IsSubFieldOf}; use alloc::{borrow::ToOwned, vec, vec::Vec}; @@ -264,6 +268,65 @@ fn dispatch_fft, E: IsField + Send + Sync>( bowers_fft_opt_fused(buffer, twiddles) } +/// Minimum LDE size at which the phased FFT path becomes competitive +/// on x86 server hardware. Below this we keep the single-pass Bowers +/// to amortise the transpose / phase-twiddle fixed overhead. +const PHASED_FFT_THRESHOLD: usize = 1 << 21; + +/// Dispatch a forward FFT and produce **natural-order** output. +/// +/// Behaviour is identical for both backends: +/// - Bowers DIF (default): `bowers_fft_opt_fused_parallel(...)` followed +/// by `in_place_bit_reverse_permute(...)`. +/// - Phased Bailey four-step (when `phased-fft` is enabled AND the +/// buffer is large enough): natural-order output directly, no +/// trailing bit-reverse. +/// +/// Callers that previously wrote +/// ```ignore +/// dispatch_fft(buf, tw)?; +/// in_place_bit_reverse_permute(buf); +/// ``` +/// should switch to `dispatch_fft_natural(buf, tw)?;` to let both +/// backends share a single call site. +#[inline] +fn dispatch_fft_natural( + buffer: &mut [FieldElement], + twiddles: &LayerTwiddles, +) -> Result<(), FFTError> +where + F: IsFFTField + IsSubFieldOf, + E: IsField + Send + Sync, + FieldElement: Send + Sync, + FieldElement: Send + Sync + Clone, +{ + { + let n = buffer.len(); + if n >= PHASED_FFT_THRESHOLD && n.is_power_of_two() { + let log_n = n.trailing_zeros() as usize; + // Building the context per call costs O(M + K) field elements + // (inner LayerTwiddles); negligible relative to the FFT itself. + // A cached / thread-local context would be a follow-up + // optimisation if we want to land this for real. + // + // Use the SEQUENTIAL-inner phased path: this dispatch is + // invoked from the prover's per-column `par_iter` LDE, which + // already saturates the rayon pool. The parallel-inner variant + // would nest a second rayon level and oversubscribe. The + // Bailey cache-blocking (lower DRAM traffic) — the win at + // memory-bound sizes — applies regardless of intra-FFT + // parallelism. + if let Ok(ctx) = PhasedFftContext::::new(log_n) { + let mut scratch: Vec> = Vec::with_capacity(n); + return bowers_phased_fft_seq_with_buf(buffer, &ctx, &mut scratch); + } + } + } + dispatch_fft(buffer, twiddles)?; + in_place_bit_reverse_permute(buffer); + Ok(()) +} + /// Dispatch inverse FFT (DIT) to parallel or sequential implementation based on buffer size. #[inline] fn dispatch_ifft, E: IsField + Send + Sync>( @@ -444,8 +507,7 @@ impl Polynomial> { *coeff = w * &*coeff; } - dispatch_fft(buffer, fwd_twiddles)?; - in_place_bit_reverse_permute(buffer); + dispatch_fft_natural(buffer, fwd_twiddles)?; Ok(()) } @@ -497,8 +559,7 @@ impl Polynomial> { buffer.resize(lde_size, FieldElement::zero()); // 4. Forward FFT on the full buffer - dispatch_fft(buffer, fwd_twiddles)?; - in_place_bit_reverse_permute(buffer); + dispatch_fft_natural(buffer, fwd_twiddles)?; Ok(()) } @@ -540,9 +601,14 @@ where LayerTwiddles::::new(order).ok_or(FFTError::DomainSizeError(order as usize))?; let mut result = coeffs.to_vec(); - dispatch_fft(&mut result, &layer_twiddles)?; if permute_to_natural { - in_place_bit_reverse_permute(&mut result); + // Natural output: route through the phased-FFT-aware wrapper so + // both backends share one call site. + dispatch_fft_natural(&mut result, &layer_twiddles)?; + } else { + // Bit-reversed output requested explicitly: keep the legacy + // Bowers path which produces bit-reversed by design. + dispatch_fft(&mut result, &layer_twiddles)?; } Ok(result) } diff --git a/prover/Cargo.toml b/prover/Cargo.toml index 03f9d8e75..cf78d7fb3 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -10,6 +10,9 @@ parallel = ["stark/parallel", "math/parallel", "crypto/parallel", "dep:rayon"] debug-checks = ["stark/debug-checks"] instruments = ["stark/instruments"] disk-spill = ["stark/disk-spill"] +# Route the LDE forward FFT through the phased (Bailey four-step) path in +# the math crate. Off by default; enable for A/B benching of large tables. +phased-fft = ["math/phased-fft"] [dependencies] stark = { path = "../crypto/stark" } diff --git a/prover/src/tables/mod.rs b/prover/src/tables/mod.rs index 4a6032ef2..7649bef4f 100644 --- a/prover/src/tables/mod.rs +++ b/prover/src/tables/mod.rs @@ -95,19 +95,58 @@ pub struct MaxRowsConfig { pub memw_register: usize, } +/// Global chunk-size scale (log2), a left-shift applied uniformly to +/// every per-table `max_rows`. Read from `LAMBDA_MAXROWS_SCALE_LOG2`, +/// falling back to [`DEFAULT_MAXROWS_SCALE_LOG2`] when unset/invalid. +/// +/// Bigger chunks → fewer chunks → fewer full StarkProof instances, which +/// cuts the per-instance overhead that scales with chunk count (FRI query +/// phase ~219 queries, 20-bit grinding, OOD/DEEP, proof serialization) and +/// pushes LDE sizes past the phased-FFT threshold into the memory-bound +/// regime where the Bailey four-step FFT wins. Full power-of-two chunks +/// are not padded (only the final partial chunk is), so the cached-LDE +/// peak heap stays ~constant across scales. +/// +/// This research branch defaults to a non-zero scale so the automated +/// main-vs-PR bench harness exercises the larger tables without needing to +/// inject an env var. Override to sweep: `LAMBDA_MAXROWS_SCALE_LOG2=0` +/// reproduces production sizing, `=1`/`=2` are intermediate steps. +/// +/// Clamped to `0..=4` so a stray value can't request a 16M+× blow-up. +fn maxrows_scale_log2_from_env() -> u32 { + match std::env::var("LAMBDA_MAXROWS_SCALE_LOG2") { + Ok(s) => match s.trim().parse::() { + Ok(v) => v.min(4), + Err(_) => DEFAULT_MAXROWS_SCALE_LOG2, + }, + Err(_) => DEFAULT_MAXROWS_SCALE_LOG2, + } +} + +/// Default chunk-size scale. **Production sizing (0).** +/// +/// Bigger tables were benchmarked on fib_8M and regress monotonically: +/// ×4 +16% / ×8 +41% prove time (bigger chunks collapse the table-level +/// parallelism the prover relies on, plus last-chunk padding). The +/// phased FFT does not recover it even in the memory-bound regime, so we +/// keep production chunk sizing. Override with `LAMBDA_MAXROWS_SCALE_LOG2` +/// to reproduce the experiment. +const DEFAULT_MAXROWS_SCALE_LOG2: u32 = 0; + impl Default for MaxRowsConfig { fn default() -> Self { + let shift = maxrows_scale_log2_from_env(); Self { - cpu: max_rows::CPU, - memw: max_rows::MEMW, - memw_aligned: max_rows::MEMW_A, - dvrm: max_rows::DVRM, - mul: max_rows::MUL, - lt: max_rows::LT, - shift: max_rows::SHIFT, - load: max_rows::LOAD, - branch: max_rows::BRANCH, - memw_register: max_rows::MEMW_R, + cpu: max_rows::CPU << shift, + memw: max_rows::MEMW << shift, + memw_aligned: max_rows::MEMW_A << shift, + dvrm: max_rows::DVRM << shift, + mul: max_rows::MUL << shift, + lt: max_rows::LT << shift, + shift: max_rows::SHIFT << shift, + load: max_rows::LOAD << shift, + branch: max_rows::BRANCH << shift, + memw_register: max_rows::MEMW_R << shift, } } } diff --git a/scripts/bench_prove.sh b/scripts/bench_prove.sh index 4ba97a921..42902f18c 100755 --- a/scripts/bench_prove.sh +++ b/scripts/bench_prove.sh @@ -61,8 +61,15 @@ build_branch() { local branch=$1 label=$2 echo -e "${GREEN}[$label] Checking out $branch...${NC}" git -C "$ROOT_DIR" checkout "$branch" - echo -e "${GREEN}[$label] Building release CLI (with jemalloc-stats)...${NC}" - cargo build --release -p cli --features jemalloc-stats --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1 + # Extra cargo features applied ONLY to the PR/feature side (not the + # base branch, which may lack them — e.g. `phased-fft`). Set via + # PR_EXTRA_FEATURES=phased-fft ./scripts/bench_prove.sh ... + local features="jemalloc-stats" + if [ "$label" != "base" ] && [ -n "${PR_EXTRA_FEATURES:-}" ]; then + features="$features $PR_EXTRA_FEATURES" + fi + echo -e "${GREEN}[$label] Building release CLI (--features \"$features\")...${NC}" + cargo build --release -p cli --features "$features" --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1 cp "$ROOT_DIR/target/release/cli" "$TMP_DIR/cli-$label" echo -e "${GREEN}[$label] Binary saved.${NC}" }