diff --git a/bin/cli/Cargo.toml b/bin/cli/Cargo.toml
index 87bb1c8fc..f91175037 100644
--- a/bin/cli/Cargo.toml
+++ b/bin/cli/Cargo.toml
@@ -18,3 +18,4 @@ env_logger = "0.11"
 jemalloc-stats = ["dep:tikv-jemalloc-ctl"]
 disk-spill = ["prover/disk-spill"]
 instruments = ["prover/instruments", "stark/instruments"]
+phased-fft = ["prover/phased-fft"]
diff --git a/crypto/math/Cargo.toml b/crypto/math/Cargo.toml
index 85979a7c4..93ad93a2f 100644
--- a/crypto/math/Cargo.toml
+++ b/crypto/math/Cargo.toml
@@ -34,6 +34,10 @@ default = ["parallel", "std"]
 std = ["alloc", "serde?/std", "serde_json?/std"]
 alloc = []
 parallel = ["dep:rayon"]
+# Route the forward FFT through the phased (Bailey four-step) path when the
+# LDE size is large enough. Output convention unchanged — natural order.
+# Off by default so A/B bench can toggle.
+phased-fft = ["parallel"]
 lambdaworks-serde-binary = ["dep:serde", "alloc"]
 lambdaworks-serde-string = ["dep:serde", "dep:serde_json", "alloc"]
 proptest = ["dep:proptest"]
@@ -45,4 +49,8 @@ getrandom = { version = "0.2.15", features = ["js"] }
 
 [[bench]]
 name = "goldilocks_benchmark"
+harness = false
+
+[[bench]]
+name = "phased_fft_benchmark"
 harness = false
\ No newline at end of file
diff --git a/crypto/math/benches/phased_fft_benchmark.rs b/crypto/math/benches/phased_fft_benchmark.rs
new file mode 100644
index 000000000..4a3644a58
--- /dev/null
+++ b/crypto/math/benches/phased_fft_benchmark.rs
@@ -0,0 +1,141 @@
+//! Phased FFT vs Bowers FFT — single-column and multi-column.
+
+use std::hint::black_box;
+use std::time::Duration;
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use rand::{RngCore, SeedableRng};
+
+use math::fft::bit_reversing::in_place_bit_reverse_permute;
+use math::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused_parallel};
+use math::fft::phased_fft::{
+    PhasedFftContext, bowers_phased_fft_multicol, bowers_phased_fft_with_buf,
+};
+use math::field::element::FieldElement;
+use math::field::goldilocks::GoldilocksField;
+
+#[cfg(feature = "parallel")]
+use rayon::prelude::*;
+
+type F = GoldilocksField;
+type FE = FieldElement<F>;
+
+fn random_input(log_n: usize, seed: u64) -> Vec<FE> {
+    let n = 1usize << log_n;
+    let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(seed);
+    (0..n).map(|_| FE::from(rng.next_u64())).collect()
+}
+
+fn bench_single_col(c: &mut Criterion) {
+    let mut group = c.benchmark_group("fft_single_col");
+    group.sample_size(10);
+    group.measurement_time(Duration::from_secs(8));
+
+    for &log_n in &[20usize, 22, 24] {
+        let n = 1usize << log_n;
+        let input = random_input(log_n, 1);
+        let bowers_twiddles = LayerTwiddles::<F>::new(log_n as u64).unwrap();
+        let phased_ctx = PhasedFftContext::<F>::new(log_n).unwrap();
+
+        group.bench_with_input(
+            BenchmarkId::new("bowers", format!("2^{log_n}")),
+            &n,
+            |b, _| {
+                b.iter_batched(
+                    || input.clone(),
+                    |mut data| {
+                        bowers_fft_opt_fused_parallel(&mut data, &bowers_twiddles).unwrap();
+                        in_place_bit_reverse_permute(&mut data);
+                        black_box(data);
+                    },
+                    criterion::BatchSize::LargeInput,
+                );
+            },
+        );
+
+        let mut scratch: Vec<FE> = Vec::with_capacity(n);
+        group.bench_with_input(
+            BenchmarkId::new("phased", format!("2^{log_n}")),
+            &n,
+            |b, _| {
+                b.iter_batched(
+                    || input.clone(),
+                    |mut data| {
+                        bowers_phased_fft_with_buf::<F, F>(&mut data, &phased_ctx, &mut scratch)
+                            .unwrap();
+                        black_box(data);
+                    },
+                    criterion::BatchSize::LargeInput,
+                );
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Multi-column bench: realistic trace LDE shape. CPU table is 74 cols at
+/// log_n = 21, MEMW at 49 cols. We probe a few (cols × log_n) combinations.
+fn bench_multi_col(c: &mut Criterion) {
+    let mut group = c.benchmark_group("fft_multi_col");
+    group.sample_size(10);
+    group.measurement_time(Duration::from_secs(10));
+
+    for &(cols, log_n) in &[(74usize, 21usize), (49, 21), (24, 22), (12, 24)] {
+        let n = 1usize << log_n;
+        let bowers_twiddles = LayerTwiddles::<F>::new(log_n as u64).unwrap();
+        let phased_ctx = PhasedFftContext::<F>::new(log_n).unwrap();
+        let columns: Vec<Vec<FE>> = (0..cols)
+            .map(|c| random_input(log_n, c as u64 + 1))
+            .collect();
+
+        // Baseline 1: rayon par_iter over columns, each calls bowers
+        // independently. This mirrors what the prover does today.
+        group.bench_with_input(
+            BenchmarkId::new("bowers_par", format!("{cols}cols_2^{log_n}")),
+            &n,
+            |b, _| {
+                b.iter_batched(
+                    || columns.clone(),
+                    |mut data| {
+                        #[cfg(feature = "parallel")]
+                        data.par_iter_mut().for_each(|col| {
+                            bowers_fft_opt_fused_parallel(col, &bowers_twiddles).unwrap();
+                            in_place_bit_reverse_permute(col);
+                        });
+                        #[cfg(not(feature = "parallel"))]
+                        for col in data.iter_mut() {
+                            bowers_fft_opt_fused_parallel(col, &bowers_twiddles).unwrap();
+                            in_place_bit_reverse_permute(col);
+                        }
+                        black_box(data);
+                    },
+                    criterion::BatchSize::LargeInput,
+                );
+            },
+        );
+
+        // Phased multi-col: shared ctx + per-worker buffer pool.
+        group.bench_with_input(
+            BenchmarkId::new("phased_multicol", format!("{cols}cols_2^{log_n}")),
+            &n,
+            |b, _| {
+                b.iter_batched(
+                    || columns.clone(),
+                    |mut data| {
+                        let mut refs: Vec<&mut [FE]> =
+                            data.iter_mut().map(|c| c.as_mut_slice()).collect();
+                        bowers_phased_fft_multicol::<F, F>(&mut refs, &phased_ctx).unwrap();
+                        black_box(data);
+                    },
+                    criterion::BatchSize::LargeInput,
+                );
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_single_col, bench_multi_col);
+criterion_main!(benches);
diff --git a/crypto/math/src/fft/mod.rs b/crypto/math/src/fft/mod.rs
index fd0d1c4e2..127a5a4cd 100644
--- a/crypto/math/src/fft/mod.rs
+++ b/crypto/math/src/fft/mod.rs
@@ -3,6 +3,8 @@ pub mod bit_reversing;
 pub mod bowers_fft;
 pub mod errors;
 #[cfg(feature = "alloc")]
+pub mod phased_fft;
+#[cfg(feature = "alloc")]
 pub mod roots_of_unity;
 
 #[cfg(all(test, feature = "alloc"))]
diff --git a/crypto/math/src/fft/phased_fft.rs b/crypto/math/src/fft/phased_fft.rs
new file mode 100644
index 000000000..2a5b0d0f6
--- /dev/null
+++ b/crypto/math/src/fft/phased_fft.rs
@@ -0,0 +1,607 @@
+//! Phased Bowers FFT — 2-phase Bailey four-step prototype.
+//!
+//! This is a probe to evaluate whether a phased / cache-blocked NTT
+//! beats the current single-pass `bowers_fft_opt_fused_parallel` for
+//! large `N` (especially `log_n >= 22` where the working set spills out
+//! of L3 and the naive layer-by-layer Bowers pattern becomes
+//! memory-bound).
+//!
+//! # Approach
+//!
+//! The standard Bailey four-step. View the input as an `M × K` row-major
+//! matrix with `N = M·K` and split log_n roughly in half:
+//!
+//! 1. Transpose input → `K × M`.
+//! 2. FFT_M on each of the K rows (reuses existing Bowers fused
+//!    kernels; each row fits in L1).
+//! 3. Multiply pointwise by `ω_N^(s·d)` where `s` is the row index
+//!    `[0, K)` and `d` the column index `[0, M)`.
+//! 4. Transpose → `M × K`.
+//! 5. FFT_K on each of the M rows.
+//! 6. Final transpose → linear output `Y[0], …, Y[N-1]`.
+//!
+//! Phase twiddles are computed on-the-fly (no upfront `N`-sized table)
+//! — for `log_n = 26` the inter-phase twiddle table would cost ~1 GB
+//! otherwise. We do `K` powers `ω_K^s` and step the accumulator
+//! inside the row, one mul per element.
+//!
+//! Output convention: **natural order**. Equivalent to
+//! `bowers_fft_opt_fused_parallel(x, ...); in_place_bit_reverse_permute(x)`.
+//!
+//! # Status
+//!
+//! Prototype. Correctness verified by equivalence tests against the
+//! existing Bowers path; performance measured by `phased_fft_bench`.
+//! Not yet wired into the prover / polynomial API.
+
+use crate::fft::bit_reversing::in_place_bit_reverse_permute;
+use crate::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused};
+use crate::fft::errors::FFTError;
+use crate::field::{
+    element::FieldElement,
+    traits::{IsFFTField, IsField, IsSubFieldOf},
+};
+use alloc::vec::Vec;
+
+#[cfg(feature = "parallel")]
+use rayon::prelude::*;
+
+/// Tile size for the out-of-place transpose. 32 × 32 × 8 B = 8 KB per
+/// tile fits the L1 of Goldilocks base-field elements (extension-field
+/// tiles are ~3× larger but still fit).
+const TRANSPOSE_TILE: usize = 32;
+
+/// Out-of-place tiled transpose, parallelised over bands of output rows.
+///
+/// Same contract as [`tiled_transpose`] but splits `output` into
+/// `TRANSPOSE_TILE`-row bands processed in parallel. Each band reads a
+/// `TRANSPOSE_TILE`-wide column strip of `input` (tiled along `rows` for
+/// cache locality) and writes a contiguous chunk of `output`. Use for
+/// the standalone single-FFT path where no outer parallel loop exists.
+#[cfg(feature = "parallel")]
+pub fn tiled_transpose_par<T: Clone + Send + Sync>(
+    input: &[T],
+    output: &mut [T],
+    rows: usize,
+    cols: usize,
+) {
+    debug_assert_eq!(input.len(), rows * cols);
+    debug_assert_eq!(output.len(), rows * cols);
+
+    output
+        .par_chunks_mut(rows * TRANSPOSE_TILE)
+        .enumerate()
+        .for_each(|(band, out_band)| {
+            let c0 = band * TRANSPOSE_TILE;
+            let c_end = (c0 + TRANSPOSE_TILE).min(cols);
+            for tile_r in (0..rows).step_by(TRANSPOSE_TILE) {
+                let r_end = (tile_r + TRANSPOSE_TILE).min(rows);
+                for c in c0..c_end {
+                    let out_row = &mut out_band[(c - c0) * rows..(c - c0) * rows + rows];
+                    for r in tile_r..r_end {
+                        out_row[r] = input[r * cols + c].clone();
+                    }
+                }
+            }
+        });
+}
+
+/// Dispatch to the parallel transpose when the feature is on, else the
+/// sequential one. Used by the standalone single-FFT path.
+#[inline]
+fn transpose_single<T: Clone + Send + Sync>(
+    input: &[T],
+    output: &mut [T],
+    rows: usize,
+    cols: usize,
+) {
+    #[cfg(feature = "parallel")]
+    tiled_transpose_par(input, output, rows, cols);
+    #[cfg(not(feature = "parallel"))]
+    tiled_transpose(input, output, rows, cols);
+}
+
+/// Out-of-place tiled transpose (sequential).
+///
+/// `input` is `rows × cols` row-major; `output` becomes `cols × rows`
+/// row-major. Tile-based for cache locality: each tile pair is
+/// `TRANSPOSE_TILE × TRANSPOSE_TILE` elements, sized to fit L1.
+///
+/// Use this when the caller is already parallelising at a coarser grain
+/// (e.g. across columns in [`bowers_phased_fft_multicol`]); nesting a
+/// parallel transpose under a parallel column loop oversubscribes the
+/// pool.
+pub fn tiled_transpose<T: Clone>(input: &[T], output: &mut [T], rows: usize, cols: usize) {
+    debug_assert_eq!(input.len(), rows * cols);
+    debug_assert_eq!(output.len(), rows * cols);
+
+    for tile_r in (0..rows).step_by(TRANSPOSE_TILE) {
+        let r_end = (tile_r + TRANSPOSE_TILE).min(rows);
+        for tile_c in (0..cols).step_by(TRANSPOSE_TILE) {
+            let c_end = (tile_c + TRANSPOSE_TILE).min(cols);
+            for r in tile_r..r_end {
+                for c in tile_c..c_end {
+                    output[c * rows + r] = input[r * cols + c].clone();
+                }
+            }
+        }
+    }
+}
+
+/// Pre-built phased-FFT twiddle context. Build once per `(log_n)` pair,
+/// reuse across many invocations to amortise the LayerTwiddles
+/// allocation + the `ω_N` lookup.
+pub struct PhasedFftContext<F: IsField + IsFFTField> {
+    pub log_n: usize,
+    pub log_m: usize,
+    pub log_k: usize,
+    pub twiddles_m: LayerTwiddles<F>,
+    /// `None` when `log_k == log_m` — caller reuses `twiddles_m`.
+    pub twiddles_k: Option<LayerTwiddles<F>>,
+    pub omega_n: FieldElement<F>,
+}
+
+impl<F: IsFFTField> PhasedFftContext<F> {
+    pub fn new(log_n: usize) -> Result<Self, FFTError> {
+        let n = 1usize << log_n;
+        if log_n < 4 {
+            return Err(FFTError::InputError(n));
+        }
+        let log_m = log_n.div_ceil(2);
+        let log_k = log_n - log_m;
+        let twiddles_m = LayerTwiddles::<F>::new(log_m as u64)
+            .ok_or(FFTError::InputError(1usize << log_m))?;
+        let twiddles_k = if log_k == log_m {
+            None
+        } else {
+            Some(
+                LayerTwiddles::<F>::new(log_k as u64)
+                    .ok_or(FFTError::InputError(1usize << log_k))?,
+            )
+        };
+        let omega_n = F::get_primitive_root_of_unity(log_n as u64)
+            .map_err(|_| FFTError::InputError(n))?;
+        Ok(Self {
+            log_n,
+            log_m,
+            log_k,
+            twiddles_m,
+            twiddles_k,
+            omega_n,
+        })
+    }
+
+    #[inline]
+    fn twiddles_k_ref(&self) -> &LayerTwiddles<F> {
+        self.twiddles_k.as_ref().unwrap_or(&self.twiddles_m)
+    }
+}
+
+/// 2-phase Bailey four-step FFT.
+///
+/// Splits the `N = 2^log_n` problem into two cache-friendly FFTs of
+/// sizes `M = 2^⌈log_n/2⌉` and `K = 2^⌊log_n/2⌋`, with a pointwise
+/// phase-twiddle multiply between them.
+///
+/// Output: natural order (equivalent to
+/// `bowers_fft_opt_fused_parallel` followed by
+/// `in_place_bit_reverse_permute`).
+///
+/// For `log_n < 4` falls back to a single-pass Bowers (the phased
+/// machinery has fixed overhead that doesn't pay back at tiny sizes).
+///
+/// Convenience wrapper that builds the `PhasedFftContext` per call.
+/// Hot-path callers should prefer [`bowers_phased_fft_with_context`].
+///
+/// # Errors
+/// Returns `FFTError::InputError` if `input.len()` is not a power of
+/// two, or if the field lacks a primitive root of unity of the
+/// required order.
+pub fn bowers_phased_fft<F, E>(input: &mut [FieldElement<E>]) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    let n = input.len();
+    if !n.is_power_of_two() {
+        return Err(FFTError::InputError(n));
+    }
+    if n <= 1 {
+        return Ok(());
+    }
+    let log_n = n.trailing_zeros() as usize;
+
+    if log_n < 4 {
+        let twiddles = LayerTwiddles::<F>::new(log_n as u64).ok_or(FFTError::InputError(n))?;
+        bowers_fft_opt_fused(input, &twiddles)?;
+        in_place_bit_reverse_permute(input);
+        return Ok(());
+    }
+
+    let ctx = PhasedFftContext::<F>::new(log_n)?;
+    bowers_phased_fft_with_context(input, &ctx)
+}
+
+/// 2-phase Bailey four-step FFT using a pre-built twiddle context.
+///
+/// Skips the inner-twiddle and `ω_N` setup that
+/// [`bowers_phased_fft`] pays per call. Caller must size `input` to
+/// `1 << ctx.log_n`. Allocates an internal `N`-element scratch buffer
+/// per call — for repeated calls (multi-column / multi-poly), use
+/// [`bowers_phased_fft_with_buf`] instead.
+pub fn bowers_phased_fft_with_context<F, E>(
+    input: &mut [FieldElement<E>],
+    ctx: &PhasedFftContext<F>,
+) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    let mut buf: Vec<FieldElement<E>> = Vec::new();
+    bowers_phased_fft_with_buf(input, ctx, &mut buf)
+}
+
+/// 2-phase Bailey four-step FFT, reusing a caller-supplied scratch
+/// buffer. The buffer is resized to `1 << ctx.log_n` on demand;
+/// repeat callers can hold a long-lived `Vec` and pay only the
+/// initial allocation.
+pub fn bowers_phased_fft_with_buf<F, E>(
+    input: &mut [FieldElement<E>],
+    ctx: &PhasedFftContext<F>,
+    buf: &mut Vec<FieldElement<E>>,
+) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    // Standalone single-FFT path: parallelise *within* this FFT.
+    phased_fft_core(input, ctx, buf, true)
+}
+
+/// Sequential-inner variant of [`bowers_phased_fft_with_buf`].
+///
+/// Runs the transposes, inner row-FFTs and phase-twiddle pass on a
+/// single thread. Use when the caller already parallelises at a coarser
+/// grain — e.g. the prover's per-column `par_iter` LDE — so nesting a
+/// second rayon level here would oversubscribe the pool. The Bailey
+/// cache-blocking (reduced DRAM traffic) still applies; only intra-FFT
+/// parallelism is dropped.
+pub fn bowers_phased_fft_seq_with_buf<F, E>(
+    input: &mut [FieldElement<E>],
+    ctx: &PhasedFftContext<F>,
+    buf: &mut Vec<FieldElement<E>>,
+) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    phased_fft_core(input, ctx, buf, false)
+}
+
+/// Core Bailey four-step worker shared by the single-FFT and
+/// multi-column entry points.
+///
+/// `parallel` selects the intra-FFT parallelism strategy:
+///   - `true`  — parallelise the transposes, per-row inner FFTs and the
+///     phase-twiddle pass (the only work in flight).
+///   - `false` — fully sequential, for callers already saturating the
+///     pool at a coarser grain (one column per worker in
+///     [`bowers_phased_fft_multicol`]); avoids nested-rayon
+///     oversubscription.
+fn phased_fft_core<F, E>(
+    input: &mut [FieldElement<E>],
+    ctx: &PhasedFftContext<F>,
+    buf: &mut Vec<FieldElement<E>>,
+    parallel: bool,
+) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    let n = 1usize << ctx.log_n;
+    if input.len() != n {
+        return Err(FFTError::InputError(input.len()));
+    }
+    let m = 1usize << ctx.log_m;
+    let k = 1usize << ctx.log_k;
+    let twiddles_m = &ctx.twiddles_m;
+    let twiddles_k_ref = ctx.twiddles_k_ref();
+    let omega_n = &ctx.omega_n;
+
+    if buf.len() < n {
+        buf.resize(n, FieldElement::<E>::zero());
+    }
+    let buf: &mut [FieldElement<E>] = &mut buf.as_mut_slice()[..n];
+
+    let transpose = |src: &[FieldElement<E>], dst: &mut [FieldElement<E>], r: usize, c: usize| {
+        if parallel {
+            transpose_single(src, dst, r, c);
+        } else {
+            tiled_transpose(src, dst, r, c);
+        }
+    };
+    let run_rows = |slice: &mut [FieldElement<E>],
+                    row_len: usize,
+                    tw: &LayerTwiddles<F>|
+     -> Result<(), FFTError> {
+        #[cfg(feature = "parallel")]
+        if parallel {
+            return slice
+                .par_chunks_mut(row_len)
+                .try_for_each(|row| -> Result<(), FFTError> {
+                    bowers_fft_opt_fused(row, tw)?;
+                    in_place_bit_reverse_permute(row);
+                    Ok(())
+                });
+        }
+        for row in slice.chunks_exact_mut(row_len) {
+            bowers_fft_opt_fused(row, tw)?;
+            in_place_bit_reverse_permute(row);
+        }
+        Ok(())
+    };
+
+    // Step 1: transpose input (M × K) → buf (K × M).
+    transpose(input, buf, m, k);
+    // Step 2: FFT_M on each of K rows in buf.
+    run_rows(buf, m, twiddles_m)?;
+    // Step 3: phase twiddle multiply on buf (K × M).
+    apply_phase_twiddles::<F, E>(buf, k, m, omega_n, parallel);
+    // Step 4: transpose buf (K × M) → input (M × K).
+    transpose(buf, input, k, m);
+    // Step 5: FFT_K on each of M rows in input.
+    run_rows(input, k, twiddles_k_ref)?;
+    // Step 6: final transpose to natural linear order. Lands in buf, so
+    // copy back (parallel copy to stay off the serial critical path).
+    transpose(input, buf, m, k);
+    copy_into(input, buf, parallel);
+
+    Ok(())
+}
+
+/// `dst.clone_from_slice(src)`, parallelised when requested.
+#[inline]
+fn copy_into<E>(dst: &mut [FieldElement<E>], src: &[FieldElement<E>], parallel: bool)
+where
+    E: IsField,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    #[cfg(feature = "parallel")]
+    if parallel {
+        dst.par_chunks_mut(1 << 16)
+            .zip(src.par_chunks(1 << 16))
+            .for_each(|(d, s)| d.clone_from_slice(s));
+        return;
+    }
+    dst.clone_from_slice(src);
+}
+
+/// Apply the inter-phase twiddle correction in-place.
+///
+/// `buf` is `rows × cols` row-major; `omega_n` is the primitive `N`-th
+/// root of unity where `N = rows · cols`. Multiplies `buf[s·cols + d]`
+/// by `ω_N^(s·d)` for `s ∈ [0, rows)`, `d ∈ [0, cols)`.
+///
+/// Rows are independent; parallelises across `s` when `parallel`.
+fn apply_phase_twiddles<F, E>(
+    buf: &mut [FieldElement<E>],
+    rows: usize,
+    cols: usize,
+    omega_n: &FieldElement<F>,
+    parallel: bool,
+) where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync,
+{
+    debug_assert_eq!(buf.len(), rows * cols);
+
+    let apply_row = |(s, row): (usize, &mut [FieldElement<E>])| {
+        if s == 0 {
+            return;
+        }
+        let omega_s = omega_n.pow(s as u64);
+        let mut accum = FieldElement::<F>::one();
+        for slot in row.iter_mut() {
+            *slot = &accum * &*slot;
+            accum = &accum * &omega_s;
+        }
+    };
+
+    #[cfg(feature = "parallel")]
+    if parallel {
+        buf.par_chunks_mut(cols).enumerate().for_each(apply_row);
+        return;
+    }
+    buf.chunks_exact_mut(cols).enumerate().for_each(apply_row);
+}
+
+/// Multi-column phased FFT: runs N-point FFTs over many columns in
+/// parallel, sharing the PhasedFftContext and amortising the scratch
+/// buffer across rayon workers via for_each_init.
+///
+/// Each column receives the same FFT (output in natural order). Use
+/// when you have a batch of independent polynomials to evaluate at the
+/// same LDE domain — typical for STARK trace LDE.
+///
+/// # Memory model
+///
+/// One scratch buffer of N field elements per active worker thread
+/// (= rayon::current_num_threads() typically). Independent of column
+/// count: 100 columns processed by 8 workers ⇒ 8 buffers, not 100.
+pub fn bowers_phased_fft_multicol<F, E>(
+    columns: &mut [&mut [FieldElement<E>]],
+    ctx: &PhasedFftContext<F>,
+) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    let n = 1usize << ctx.log_n;
+
+    #[cfg(feature = "parallel")]
+    {
+        columns
+            .par_iter_mut()
+            .try_for_each_init(
+                || Vec::<FieldElement<E>>::with_capacity(n),
+                |buf, col| -> Result<(), FFTError> {
+                    // Sequential inner: this loop already parallelises
+                    // across columns; nesting would oversubscribe rayon.
+                    phased_fft_core(col, ctx, buf, false)
+                },
+            )?;
+    }
+    #[cfg(not(feature = "parallel"))]
+    {
+        let mut buf: Vec<FieldElement<E>> = Vec::with_capacity(n);
+        for col in columns.iter_mut() {
+            bowers_phased_fft_with_buf(col, ctx, &mut buf)?;
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::field::goldilocks::GoldilocksField;
+
+    type F = GoldilocksField;
+    type FE = FieldElement<F>;
+
+    /// Reference: existing Bowers + bit_reverse → natural order output.
+    fn reference_fft(input: &[FE]) -> Vec<FE> {
+        let log_n = input.len().trailing_zeros() as u64;
+        let twiddles = LayerTwiddles::<F>::new(log_n).unwrap();
+        let mut buf = input.to_vec();
+        bowers_fft_opt_fused(&mut buf, &twiddles).unwrap();
+        in_place_bit_reverse_permute(&mut buf);
+        buf
+    }
+
+    fn random_input(log_n: usize, seed: u64) -> Vec<FE> {
+        // Deterministic pseudo-random over Goldilocks: chain LCG on u64.
+        let n = 1usize << log_n;
+        let mut state = seed.wrapping_mul(0x9E37_79B9_7F4A_7C15).wrapping_add(1);
+        (0..n)
+            .map(|_| {
+                state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
+                FE::from(state)
+            })
+            .collect()
+    }
+
+    #[test]
+    fn phased_equivalence_log4() {
+        let input = random_input(4, 42);
+        let expected = reference_fft(&input);
+        let mut actual = input;
+        bowers_phased_fft::<F, F>(&mut actual).unwrap();
+        assert_eq!(actual, expected, "log_n = 4 mismatch");
+    }
+
+    #[test]
+    fn phased_equivalence_log5_odd_split() {
+        // log_n = 5 → log_m = 3, log_k = 2 (uneven split).
+        let input = random_input(5, 99);
+        let expected = reference_fft(&input);
+        let mut actual = input;
+        bowers_phased_fft::<F, F>(&mut actual).unwrap();
+        assert_eq!(actual, expected, "log_n = 5 mismatch");
+    }
+
+    #[test]
+    fn phased_equivalence_log_n_4_through_12() {
+        for log_n in 4..=12 {
+            for seed in [1u64, 7, 12345, 0xdead_beef] {
+                let input = random_input(log_n, seed);
+                let expected = reference_fft(&input);
+                let mut actual = input;
+                bowers_phased_fft::<F, F>(&mut actual).unwrap();
+                assert_eq!(
+                    actual, expected,
+                    "log_n = {log_n}, seed = {seed:x}: phased output mismatches reference"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn phased_equivalence_log_n_16() {
+        let input = random_input(16, 0xcafe_babe);
+        let expected = reference_fft(&input);
+        let mut actual = input;
+        bowers_phased_fft::<F, F>(&mut actual).unwrap();
+        assert_eq!(actual, expected, "log_n = 16 mismatch");
+    }
+
+    #[test]
+    fn phased_equivalence_log_n_18_odd_split() {
+        // log_n = 18 → log_m = 9, log_k = 9 (even split, no reuse).
+        let input = random_input(18, 7);
+        let expected = reference_fft(&input);
+        let mut actual = input;
+        bowers_phased_fft::<F, F>(&mut actual).unwrap();
+        assert_eq!(actual, expected, "log_n = 18 mismatch");
+    }
+
+    #[test]
+    fn phased_multicol_matches_per_column_reference() {
+        let log_n = 14;
+        let num_cols = 6;
+        let ctx = PhasedFftContext::<F>::new(log_n).expect("ctx valid");
+
+        let mut columns: Vec<Vec<FE>> =
+            (0..num_cols).map(|c| random_input(log_n, c as u64 + 1)).collect();
+        let expected: Vec<Vec<FE>> = columns.iter().map(|c| reference_fft(c)).collect();
+
+        let mut col_refs: Vec<&mut [FE]> =
+            columns.iter_mut().map(|v| v.as_mut_slice()).collect();
+        bowers_phased_fft_multicol::<F, F>(&mut col_refs, &ctx).expect("multicol ok");
+
+        for (c, (got, want)) in columns.iter().zip(expected.iter()).enumerate() {
+            assert_eq!(got, want, "column {c} mismatches reference");
+        }
+    }
+
+    #[test]
+    fn tiled_transpose_2x3() {
+        let input: Vec<u64> = vec![1, 2, 3, 4, 5, 6];
+        let mut output = vec![0u64; 6];
+        tiled_transpose(&input, &mut output, 2, 3);
+        // Input (2×3):
+        //   1 2 3
+        //   4 5 6
+        // Expected output (3×2):
+        //   1 4
+        //   2 5
+        //   3 6
+        assert_eq!(output, vec![1, 4, 2, 5, 3, 6]);
+    }
+
+    #[test]
+    fn tiled_transpose_roundtrip() {
+        let rows = 17;
+        let cols = 23;
+        let input: Vec<u64> = (0..(rows * cols) as u64).collect();
+        let mut t = vec![0u64; rows * cols];
+        let mut back = vec![0u64; rows * cols];
+        tiled_transpose(&input, &mut t, rows, cols);
+        tiled_transpose(&t, &mut back, cols, rows);
+        assert_eq!(back, input);
+    }
+}
diff --git a/crypto/math/src/polynomial.rs b/crypto/math/src/polynomial.rs
index e3eaf66d4..b7bcc4d63 100644
--- a/crypto/math/src/polynomial.rs
+++ b/crypto/math/src/polynomial.rs
@@ -4,6 +4,10 @@ use crate::fft::bowers_fft::{LayerTwiddles, bowers_fft_opt_fused, bowers_ifft_op
 #[cfg(feature = "parallel")]
 use crate::fft::bowers_fft::{bowers_fft_opt_fused_parallel, bowers_ifft_opt_parallel};
 use crate::fft::errors::FFTError;
+// Phased FFT is unconditionally wired on this research branch so the bench
+// harness exercises it without needing a cargo feature flag. The
+// `phased-fft` feature is retained as a no-op for compatibility.
+use crate::fft::phased_fft::{PhasedFftContext, bowers_phased_fft_seq_with_buf};
 use crate::field::traits::{IsFFTField, IsField, IsSubFieldOf};
 use alloc::{borrow::ToOwned, vec, vec::Vec};
 
@@ -264,6 +268,65 @@ fn dispatch_fft<F: IsFFTField + IsSubFieldOf<E>, E: IsField + Send + Sync>(
     bowers_fft_opt_fused(buffer, twiddles)
 }
 
+/// Minimum LDE size at which the phased FFT path becomes competitive
+/// on x86 server hardware. Below this we keep the single-pass Bowers
+/// to amortise the transpose / phase-twiddle fixed overhead.
+const PHASED_FFT_THRESHOLD: usize = 1 << 21;
+
+/// Dispatch a forward FFT and produce **natural-order** output.
+///
+/// Behaviour is identical for both backends:
+///   - Bowers DIF (default): `bowers_fft_opt_fused_parallel(...)` followed
+///     by `in_place_bit_reverse_permute(...)`.
+///   - Phased Bailey four-step (when `phased-fft` is enabled AND the
+///     buffer is large enough): natural-order output directly, no
+///     trailing bit-reverse.
+///
+/// Callers that previously wrote
+/// ```ignore
+/// dispatch_fft(buf, tw)?;
+/// in_place_bit_reverse_permute(buf);
+/// ```
+/// should switch to `dispatch_fft_natural(buf, tw)?;` to let both
+/// backends share a single call site.
+#[inline]
+fn dispatch_fft_natural<F, E>(
+    buffer: &mut [FieldElement<E>],
+    twiddles: &LayerTwiddles<F>,
+) -> Result<(), FFTError>
+where
+    F: IsFFTField + IsSubFieldOf<E>,
+    E: IsField + Send + Sync,
+    FieldElement<F>: Send + Sync,
+    FieldElement<E>: Send + Sync + Clone,
+{
+    {
+        let n = buffer.len();
+        if n >= PHASED_FFT_THRESHOLD && n.is_power_of_two() {
+            let log_n = n.trailing_zeros() as usize;
+            // Building the context per call costs O(M + K) field elements
+            // (inner LayerTwiddles); negligible relative to the FFT itself.
+            // A cached / thread-local context would be a follow-up
+            // optimisation if we want to land this for real.
+            //
+            // Use the SEQUENTIAL-inner phased path: this dispatch is
+            // invoked from the prover's per-column `par_iter` LDE, which
+            // already saturates the rayon pool. The parallel-inner variant
+            // would nest a second rayon level and oversubscribe. The
+            // Bailey cache-blocking (lower DRAM traffic) — the win at
+            // memory-bound sizes — applies regardless of intra-FFT
+            // parallelism.
+            if let Ok(ctx) = PhasedFftContext::<F>::new(log_n) {
+                let mut scratch: Vec<FieldElement<E>> = Vec::with_capacity(n);
+                return bowers_phased_fft_seq_with_buf(buffer, &ctx, &mut scratch);
+            }
+        }
+    }
+    dispatch_fft(buffer, twiddles)?;
+    in_place_bit_reverse_permute(buffer);
+    Ok(())
+}
+
 /// Dispatch inverse FFT (DIT) to parallel or sequential implementation based on buffer size.
 #[inline]
 fn dispatch_ifft<F: IsFFTField + IsSubFieldOf<E>, E: IsField + Send + Sync>(
@@ -444,8 +507,7 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
             *coeff = w * &*coeff;
         }
 
-        dispatch_fft(buffer, fwd_twiddles)?;
-        in_place_bit_reverse_permute(buffer);
+        dispatch_fft_natural(buffer, fwd_twiddles)?;
 
         Ok(())
     }
@@ -497,8 +559,7 @@ impl<E: IsField> Polynomial<FieldElement<E>> {
         buffer.resize(lde_size, FieldElement::zero());
 
         // 4. Forward FFT on the full buffer
-        dispatch_fft(buffer, fwd_twiddles)?;
-        in_place_bit_reverse_permute(buffer);
+        dispatch_fft_natural(buffer, fwd_twiddles)?;
 
         Ok(())
     }
@@ -540,9 +601,14 @@ where
         LayerTwiddles::<F>::new(order).ok_or(FFTError::DomainSizeError(order as usize))?;
 
     let mut result = coeffs.to_vec();
-    dispatch_fft(&mut result, &layer_twiddles)?;
     if permute_to_natural {
-        in_place_bit_reverse_permute(&mut result);
+        // Natural output: route through the phased-FFT-aware wrapper so
+        // both backends share one call site.
+        dispatch_fft_natural(&mut result, &layer_twiddles)?;
+    } else {
+        // Bit-reversed output requested explicitly: keep the legacy
+        // Bowers path which produces bit-reversed by design.
+        dispatch_fft(&mut result, &layer_twiddles)?;
     }
     Ok(result)
 }
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
index 03f9d8e75..cf78d7fb3 100644
--- a/prover/Cargo.toml
+++ b/prover/Cargo.toml
@@ -10,6 +10,9 @@ parallel = ["stark/parallel", "math/parallel", "crypto/parallel", "dep:rayon"]
 debug-checks = ["stark/debug-checks"]
 instruments = ["stark/instruments"]
 disk-spill = ["stark/disk-spill"]
+# Route the LDE forward FFT through the phased (Bailey four-step) path in
+# the math crate. Off by default; enable for A/B benching of large tables.
+phased-fft = ["math/phased-fft"]
 
 [dependencies]
 stark = { path = "../crypto/stark" }
diff --git a/prover/src/tables/mod.rs b/prover/src/tables/mod.rs
index 4a6032ef2..7649bef4f 100644
--- a/prover/src/tables/mod.rs
+++ b/prover/src/tables/mod.rs
@@ -95,19 +95,58 @@ pub struct MaxRowsConfig {
     pub memw_register: usize,
 }
 
+/// Global chunk-size scale (log2), a left-shift applied uniformly to
+/// every per-table `max_rows`. Read from `LAMBDA_MAXROWS_SCALE_LOG2`,
+/// falling back to [`DEFAULT_MAXROWS_SCALE_LOG2`] when unset/invalid.
+///
+/// Bigger chunks → fewer chunks → fewer full StarkProof instances, which
+/// cuts the per-instance overhead that scales with chunk count (FRI query
+/// phase ~219 queries, 20-bit grinding, OOD/DEEP, proof serialization) and
+/// pushes LDE sizes past the phased-FFT threshold into the memory-bound
+/// regime where the Bailey four-step FFT wins. Full power-of-two chunks
+/// are not padded (only the final partial chunk is), so the cached-LDE
+/// peak heap stays ~constant across scales.
+///
+/// This research branch defaults to a non-zero scale so the automated
+/// main-vs-PR bench harness exercises the larger tables without needing to
+/// inject an env var. Override to sweep: `LAMBDA_MAXROWS_SCALE_LOG2=0`
+/// reproduces production sizing, `=1`/`=2` are intermediate steps.
+///
+/// Clamped to `0..=4` so a stray value can't request a 16M+× blow-up.
+fn maxrows_scale_log2_from_env() -> u32 {
+    match std::env::var("LAMBDA_MAXROWS_SCALE_LOG2") {
+        Ok(s) => match s.trim().parse::<u32>() {
+            Ok(v) => v.min(4),
+            Err(_) => DEFAULT_MAXROWS_SCALE_LOG2,
+        },
+        Err(_) => DEFAULT_MAXROWS_SCALE_LOG2,
+    }
+}
+
+/// Default chunk-size scale. **Production sizing (0).**
+///
+/// Bigger tables were benchmarked on fib_8M and regress monotonically:
+/// ×4 +16% / ×8 +41% prove time (bigger chunks collapse the table-level
+/// parallelism the prover relies on, plus last-chunk padding). The
+/// phased FFT does not recover it even in the memory-bound regime, so we
+/// keep production chunk sizing. Override with `LAMBDA_MAXROWS_SCALE_LOG2`
+/// to reproduce the experiment.
+const DEFAULT_MAXROWS_SCALE_LOG2: u32 = 0;
+
 impl Default for MaxRowsConfig {
     fn default() -> Self {
+        let shift = maxrows_scale_log2_from_env();
         Self {
-            cpu: max_rows::CPU,
-            memw: max_rows::MEMW,
-            memw_aligned: max_rows::MEMW_A,
-            dvrm: max_rows::DVRM,
-            mul: max_rows::MUL,
-            lt: max_rows::LT,
-            shift: max_rows::SHIFT,
-            load: max_rows::LOAD,
-            branch: max_rows::BRANCH,
-            memw_register: max_rows::MEMW_R,
+            cpu: max_rows::CPU << shift,
+            memw: max_rows::MEMW << shift,
+            memw_aligned: max_rows::MEMW_A << shift,
+            dvrm: max_rows::DVRM << shift,
+            mul: max_rows::MUL << shift,
+            lt: max_rows::LT << shift,
+            shift: max_rows::SHIFT << shift,
+            load: max_rows::LOAD << shift,
+            branch: max_rows::BRANCH << shift,
+            memw_register: max_rows::MEMW_R << shift,
         }
     }
 }
diff --git a/scripts/bench_prove.sh b/scripts/bench_prove.sh
index 4ba97a921..42902f18c 100755
--- a/scripts/bench_prove.sh
+++ b/scripts/bench_prove.sh
@@ -61,8 +61,15 @@ build_branch() {
     local branch=$1 label=$2
     echo -e "${GREEN}[$label] Checking out $branch...${NC}"
     git -C "$ROOT_DIR" checkout "$branch"
-    echo -e "${GREEN}[$label] Building release CLI (with jemalloc-stats)...${NC}"
-    cargo build --release -p cli --features jemalloc-stats --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1
+    # Extra cargo features applied ONLY to the PR/feature side (not the
+    # base branch, which may lack them — e.g. `phased-fft`). Set via
+    #   PR_EXTRA_FEATURES=phased-fft ./scripts/bench_prove.sh ...
+    local features="jemalloc-stats"
+    if [ "$label" != "base" ] && [ -n "${PR_EXTRA_FEATURES:-}" ]; then
+        features="$features $PR_EXTRA_FEATURES"
+    fi
+    echo -e "${GREEN}[$label] Building release CLI (--features \"$features\")...${NC}"
+    cargo build --release -p cli --features "$features" --manifest-path "$ROOT_DIR/Cargo.toml" 2>&1 | tail -1
     cp "$ROOT_DIR/target/release/cli" "$TMP_DIR/cli-$label"
     echo -e "${GREEN}[$label] Binary saved.${NC}"
 }