diff --git a/Makefile b/Makefile
index e4e54f2d5..f29ec030a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: deps deps-linux deps-macos prepare-test-data compile-programs-asm compile-programs-rust compile-bench \
 compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm test-no-compile \
 test-asm-no-compile test-rust test-rust-no-compile test-executor flamegraph-prover \
-test-fast test-prover test-prover-all test-disk-spill test-math-cuda bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint
+test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint
 
 UNAME := $(shell uname)
 
@@ -194,6 +194,12 @@ test-disk-spill:
 test-math-cuda:
 	cargo test -p math-cuda --release
 
+# End-to-end cuda dispatch coverage (requires NVIDIA GPU + nvcc).
+# Asserts every R1/R2/R3 GPU counter fired on a real prove.
+test-cuda-integration:
+	cargo test -p lambda-vm-prover --release --features cuda \
+	    --test cuda_path_integration -- --ignored --nocapture
+
 # math-cuda quick microbench (median of 10 runs)
 bench-math-cuda:
 	cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
index bc84f2653..25c16a634 100644
--- a/crypto/math-cuda/build.rs
+++ b/crypto/math-cuda/build.rs
@@ -111,4 +111,5 @@ fn main() {
     compile_ptx("arith.cu", "arith.ptx", have_nvcc);
     compile_ptx("ntt.cu", "ntt.ptx", have_nvcc);
     compile_ptx("keccak.cu", "keccak.ptx", have_nvcc);
+    compile_ptx("barycentric.cu", "barycentric.ptx", have_nvcc);
 }
diff --git a/crypto/math-cuda/kernels/barycentric.cu b/crypto/math-cuda/kernels/barycentric.cu
new file mode 100644
index 000000000..5c18bcb88
--- /dev/null
+++ b/crypto/math-cuda/kernels/barycentric.cu
@@ -0,0 +1,192 @@
+// Barycentric evaluation of a polynomial (given as evaluations on a coset) at
+// a single out-of-domain point. Matches the CPU
+// `math::polynomial::interpolate_coset_eval_*_with_g_n_inv` pair.
+//
+// Per column, the barycentric sum is
+//     S = sum over i of point_i * eval_i * inv_denom_i
+// where `point_i` is a base-field coset point, `eval_i` is the polynomial's
+// value at that point (base for main-trace columns, ext3 for aux or composition
+// columns), and `inv_denom_i = 1 / (z - point_i)` is an ext3 scalar (same for
+// every column sharing the evaluation point `z`).
+//
+// These kernels compute only S. The full OOD value is S scaled by the ext3
+// constant `vanishing * n_inv * g_n_inv`, which is constant across a column, so
+// the caller applies it once per column (one ext3 mul per column, independent
+// of n). Keeping it on the host means the kernel takes no extra ext3 constant
+// argument.
+//
+// Launch: grid = (num_cols, 1, 1), block = (BARY_BLOCK_DIM, 1, 1).
+
+#include "goldilocks.cuh"
+#include "ext3.cuh"
+
+// 256 threads/block. One ext3 accumulator per thread in shmem => 6 KiB.
+#define BARY_BLOCK_DIM 256
+
+__device__ __forceinline__ ext3::Fe3 block_reduce_ext3(ext3::Fe3 my) {
+    __shared__ uint64_t shm_a[BARY_BLOCK_DIM];
+    __shared__ uint64_t shm_b[BARY_BLOCK_DIM];
+    __shared__ uint64_t shm_c[BARY_BLOCK_DIM];
+    uint32_t tid = threadIdx.x;
+    shm_a[tid] = my.a;
+    shm_b[tid] = my.b;
+    shm_c[tid] = my.c;
+    __syncthreads();
+    for (uint32_t s = BARY_BLOCK_DIM / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            shm_a[tid] = goldilocks::add(shm_a[tid], shm_a[tid + s]);
+            shm_b[tid] = goldilocks::add(shm_b[tid], shm_b[tid + s]);
+            shm_c[tid] = goldilocks::add(shm_c[tid], shm_c[tid + s]);
+        }
+        __syncthreads();
+    }
+    return ext3::make(shm_a[0], shm_b[0], shm_c[0]);
+}
+
+/// Base-column variant: M base-field columns, each `col_stride` u64 apart.
+/// `inv_denoms` is a flat 3N u64 buffer (ext3, interleaved `[a0,b0,c0,...]`).
+/// Writes `out_ext3_int`: 3M u64, ext3 interleaved, one accumulator per column.
+extern "C" __global__ void barycentric_base_batched(
+    const uint64_t *columns,
+    uint64_t col_stride,
+    const uint64_t *coset_points,
+    const uint64_t *inv_denoms,
+    uint64_t n,
+    uint64_t *out_ext3_int
+) {
+    uint64_t col = blockIdx.x;
+    const uint64_t *col_data = columns + col * col_stride;
+
+    ext3::Fe3 acc = ext3::zero();
+    for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
+        uint64_t eval  = col_data[i];
+        uint64_t point = coset_points[i];
+        uint64_t pe    = goldilocks::mul(point, eval);   // F * F -> F
+        ext3::Fe3 inv_d = ext3::make(
+            inv_denoms[i * 3 + 0],
+            inv_denoms[i * 3 + 1],
+            inv_denoms[i * 3 + 2]);
+        ext3::Fe3 term = ext3::mul_base(inv_d, pe);      // E * F -> E
+        acc = ext3::add(acc, term);
+    }
+
+    ext3::Fe3 sum = block_reduce_ext3(acc);
+    if (threadIdx.x == 0) {
+        out_ext3_int[col * 3 + 0] = sum.a;
+        out_ext3_int[col * 3 + 1] = sum.b;
+        out_ext3_int[col * 3 + 2] = sum.c;
+    }
+}
+
+/// Same as `barycentric_base_batched` but reads rows at stride `row_stride`
+/// within each column. Treats the column as an LDE of length `n * row_stride`
+/// and sums over the trace-size coset (every `row_stride`-th row). Lets R3 OOD
+/// run directly against the LDE device handle from R1 without copying the
+/// strided rows into a separate trace-size buffer.
+extern "C" __global__ void barycentric_base_batched_strided(
+    const uint64_t *columns,
+    uint64_t col_stride,
+    uint64_t row_stride,
+    const uint64_t *coset_points,
+    const uint64_t *inv_denoms,
+    uint64_t n,
+    uint64_t *out_ext3_int
+) {
+    uint64_t col = blockIdx.x;
+    const uint64_t *col_data = columns + col * col_stride;
+
+    ext3::Fe3 acc = ext3::zero();
+    for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
+        uint64_t eval  = col_data[i * row_stride];
+        uint64_t point = coset_points[i];
+        uint64_t pe    = goldilocks::mul(point, eval);
+        ext3::Fe3 inv_d = ext3::make(
+            inv_denoms[i * 3 + 0],
+            inv_denoms[i * 3 + 1],
+            inv_denoms[i * 3 + 2]);
+        ext3::Fe3 term = ext3::mul_base(inv_d, pe);
+        acc = ext3::add(acc, term);
+    }
+
+    ext3::Fe3 sum = block_reduce_ext3(acc);
+    if (threadIdx.x == 0) {
+        out_ext3_int[col * 3 + 0] = sum.a;
+        out_ext3_int[col * 3 + 1] = sum.b;
+        out_ext3_int[col * 3 + 2] = sum.c;
+    }
+}
+
+/// Ext3-column variant: M ext3 columns stored as 3M base slabs. Column `c`
+/// lives at `columns[(c*3+k)*col_stride + i]` for component `k` in 0..3.
+extern "C" __global__ void barycentric_ext3_batched(
+    const uint64_t *columns,
+    uint64_t col_stride,
+    const uint64_t *coset_points,
+    const uint64_t *inv_denoms,
+    uint64_t n,
+    uint64_t *out_ext3_int
+) {
+    uint64_t col = blockIdx.x;
+    const uint64_t *slab_a = columns + (col * 3 + 0) * col_stride;
+    const uint64_t *slab_b = columns + (col * 3 + 1) * col_stride;
+    const uint64_t *slab_c = columns + (col * 3 + 2) * col_stride;
+
+    ext3::Fe3 acc = ext3::zero();
+    for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
+        ext3::Fe3 eval = ext3::make(slab_a[i], slab_b[i], slab_c[i]);
+        uint64_t point = coset_points[i];
+        // F * E -> E. Point times eval, componentwise on the 3 base components.
+        ext3::Fe3 pe = ext3::mul_base(eval, point);
+        // E * E -> E
+        ext3::Fe3 inv_d = ext3::make(
+            inv_denoms[i * 3 + 0],
+            inv_denoms[i * 3 + 1],
+            inv_denoms[i * 3 + 2]);
+        ext3::Fe3 term = ext3::mul(pe, inv_d);
+        acc = ext3::add(acc, term);
+    }
+
+    ext3::Fe3 sum = block_reduce_ext3(acc);
+    if (threadIdx.x == 0) {
+        out_ext3_int[col * 3 + 0] = sum.a;
+        out_ext3_int[col * 3 + 1] = sum.b;
+        out_ext3_int[col * 3 + 2] = sum.c;
+    }
+}
+
+/// Strided ext3 variant for R3 OOD of aux LDE.
+extern "C" __global__ void barycentric_ext3_batched_strided(
+    const uint64_t *columns,
+    uint64_t col_stride,
+    uint64_t row_stride,
+    const uint64_t *coset_points,
+    const uint64_t *inv_denoms,
+    uint64_t n,
+    uint64_t *out_ext3_int
+) {
+    uint64_t col = blockIdx.x;
+    const uint64_t *slab_a = columns + (col * 3 + 0) * col_stride;
+    const uint64_t *slab_b = columns + (col * 3 + 1) * col_stride;
+    const uint64_t *slab_c = columns + (col * 3 + 2) * col_stride;
+
+    ext3::Fe3 acc = ext3::zero();
+    for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
+        uint64_t lde_i = i * row_stride;
+        ext3::Fe3 eval = ext3::make(slab_a[lde_i], slab_b[lde_i], slab_c[lde_i]);
+        uint64_t point = coset_points[i];
+        ext3::Fe3 pe = ext3::mul_base(eval, point);
+        ext3::Fe3 inv_d = ext3::make(
+            inv_denoms[i * 3 + 0],
+            inv_denoms[i * 3 + 1],
+            inv_denoms[i * 3 + 2]);
+        ext3::Fe3 term = ext3::mul(pe, inv_d);
+        acc = ext3::add(acc, term);
+    }
+
+    ext3::Fe3 sum = block_reduce_ext3(acc);
+    if (threadIdx.x == 0) {
+        out_ext3_int[col * 3 + 0] = sum.a;
+        out_ext3_int[col * 3 + 1] = sum.b;
+        out_ext3_int[col * 3 + 2] = sum.c;
+    }
+}
diff --git a/crypto/math-cuda/src/barycentric.rs b/crypto/math-cuda/src/barycentric.rs
new file mode 100644
index 000000000..b4eb12dfd
--- /dev/null
+++ b/crypto/math-cuda/src/barycentric.rs
@@ -0,0 +1,227 @@
+//! Barycentric evaluation on device. Matches the CPU
+//! [`interpolate_coset_eval_ext_with_g_n_inv`](math::polynomial::interpolate_coset_eval_ext_with_g_n_inv)
+//!
+//! The kernels compute only the unscaled barycentric sum
+//!     S = sum over i of point_i * eval_i * inv_denom_i
+//! per column. The caller multiplies each `S` by the ext3 scalar
+//! `(z^N - g^N) * 1/N * 1/g^N` to get the final OOD value. That scaling is
+//! one ext3 mul per column and stays on host.
+
+use cudarc::driver::{LaunchConfig, PushKernelArg};
+
+use crate::Result;
+use crate::device::backend;
+use crate::lde::{GpuLdeBase, GpuLdeExt3};
+
+const BLOCK_DIM: u32 = 256;
+
+/// Barycentric sums over M base-field columns, each of length `n`, laid out
+/// with stride `col_stride` (so column `c` is at `columns[c*col_stride ..
+/// c*col_stride + n]`). `inv_denoms` is 3N u64 (ext3 interleaved).
+/// Returns 3M u64 (ext3 interleaved), one per column.
+pub fn barycentric_base(
+    columns: &[u64],
+    col_stride: usize,
+    coset_points: &[u64],
+    inv_denoms_ext3: &[u64],
+    n: usize,
+    num_cols: usize,
+) -> Result<Vec<u64>> {
+    assert_eq!(coset_points.len(), n);
+    assert_eq!(inv_denoms_ext3.len(), 3 * n);
+    assert!(columns.len() >= num_cols * col_stride);
+    // Kernel reads col_data[0..n] per column, so col_stride must cover at
+    // least n u64s. Smaller strides would read past the column boundary.
+    assert!(
+        col_stride >= n,
+        "col_stride {col_stride} < n {n}: kernel reads col_data[0..n] but stride is shorter"
+    );
+    if num_cols == 0 || n == 0 {
+        return Ok(vec![0; 3 * num_cols]);
+    }
+
+    let be = backend()?;
+    let stream = be.next_stream();
+
+    let cols_dev = stream.clone_htod(&columns[..num_cols * col_stride])?;
+    let points_dev = stream.clone_htod(coset_points)?;
+    let inv_dev = stream.clone_htod(inv_denoms_ext3)?;
+    let mut out_dev = stream.alloc_zeros::<u64>(3 * num_cols)?;
+
+    let col_stride_u64 = col_stride as u64;
+    let n_u64 = n as u64;
+    let cfg = LaunchConfig {
+        grid_dim: (num_cols as u32, 1, 1),
+        block_dim: (BLOCK_DIM, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    unsafe {
+        stream
+            .launch_builder(&be.barycentric_base_batched)
+            .arg(&cols_dev)
+            .arg(&col_stride_u64)
+            .arg(&points_dev)
+            .arg(&inv_dev)
+            .arg(&n_u64)
+            .arg(&mut out_dev)
+            .launch(cfg)?;
+    }
+    let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
+    Ok(out)
+}
+
+/// Same as [`barycentric_base`] but `columns` holds M ext3 columns in the
+/// de-interleaved layout: slab `c*3 + k` at offset `(c*3+k)*col_stride`.
+/// `columns.len() >= num_cols * 3 * col_stride`.
+pub fn barycentric_ext3(
+    columns: &[u64],
+    col_stride: usize,
+    coset_points: &[u64],
+    inv_denoms_ext3: &[u64],
+    n: usize,
+    num_cols: usize,
+) -> Result<Vec<u64>> {
+    assert_eq!(coset_points.len(), n);
+    assert_eq!(inv_denoms_ext3.len(), 3 * n);
+    assert!(columns.len() >= num_cols * 3 * col_stride);
+    // Each ext3 slab is read at indices [0..n), so col_stride must cover at
+    // least n u64s. Smaller strides would read past the slab boundary.
+    assert!(
+        col_stride >= n,
+        "col_stride {col_stride} < n {n}: kernel reads slab[0..n] but stride is shorter"
+    );
+    if num_cols == 0 || n == 0 {
+        return Ok(vec![0; 3 * num_cols]);
+    }
+
+    let be = backend()?;
+    let stream = be.next_stream();
+
+    let cols_dev = stream.clone_htod(&columns[..num_cols * 3 * col_stride])?;
+    let points_dev = stream.clone_htod(coset_points)?;
+    let inv_dev = stream.clone_htod(inv_denoms_ext3)?;
+    let mut out_dev = stream.alloc_zeros::<u64>(3 * num_cols)?;
+
+    let col_stride_u64 = col_stride as u64;
+    let n_u64 = n as u64;
+    let cfg = LaunchConfig {
+        grid_dim: (num_cols as u32, 1, 1),
+        block_dim: (BLOCK_DIM, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    unsafe {
+        stream
+            .launch_builder(&be.barycentric_ext3_batched)
+            .arg(&cols_dev)
+            .arg(&col_stride_u64)
+            .arg(&points_dev)
+            .arg(&inv_dev)
+            .arg(&n_u64)
+            .arg(&mut out_dev)
+            .launch(cfg)?;
+    }
+    let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
+    Ok(out)
+}
+
+/// Run `barycentric_base_batched_strided` over the base LDE already on
+/// device (`main_handle`), summing over the trace-size coset (every
+/// `row_stride = blowup_factor`-th row). H2Ds only the coset points and
+/// inv_denoms. The column data never crosses PCIe.
+pub fn barycentric_base_on_device(
+    main_handle: &GpuLdeBase,
+    row_stride: usize,
+    coset_points: &[u64],
+    inv_denoms_ext3: &[u64],
+    n: usize,
+) -> Result<Vec<u64>> {
+    assert_eq!(coset_points.len(), n);
+    assert_eq!(inv_denoms_ext3.len(), 3 * n);
+    let num_cols = main_handle.m;
+    if num_cols == 0 || n == 0 {
+        return Ok(vec![0; 3 * num_cols]);
+    }
+    let col_stride = main_handle.lde_size;
+
+    let be = backend()?;
+    let stream = be.next_stream();
+
+    let points_dev = stream.clone_htod(coset_points)?;
+    let inv_dev = stream.clone_htod(inv_denoms_ext3)?;
+    let mut out_dev = stream.alloc_zeros::<u64>(3 * num_cols)?;
+
+    let col_stride_u64 = col_stride as u64;
+    let row_stride_u64 = row_stride as u64;
+    let n_u64 = n as u64;
+    let cfg = LaunchConfig {
+        grid_dim: (num_cols as u32, 1, 1),
+        block_dim: (BLOCK_DIM, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    unsafe {
+        stream
+            .launch_builder(&be.barycentric_base_batched_strided)
+            .arg(main_handle.buf.as_ref())
+            .arg(&col_stride_u64)
+            .arg(&row_stride_u64)
+            .arg(&points_dev)
+            .arg(&inv_dev)
+            .arg(&n_u64)
+            .arg(&mut out_dev)
+            .launch(cfg)?;
+    }
+    let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
+    Ok(out)
+}
+
+/// Ext3 counterpart of [`barycentric_base_on_device`]. Reads the aux LDE
+/// from the de-interleaved device handle.
+pub fn barycentric_ext3_on_device(
+    aux_handle: &GpuLdeExt3,
+    row_stride: usize,
+    coset_points: &[u64],
+    inv_denoms_ext3: &[u64],
+    n: usize,
+) -> Result<Vec<u64>> {
+    assert_eq!(coset_points.len(), n);
+    assert_eq!(inv_denoms_ext3.len(), 3 * n);
+    let num_cols = aux_handle.m;
+    if num_cols == 0 || n == 0 {
+        return Ok(vec![0; 3 * num_cols]);
+    }
+    let col_stride = aux_handle.lde_size;
+
+    let be = backend()?;
+    let stream = be.next_stream();
+
+    let points_dev = stream.clone_htod(coset_points)?;
+    let inv_dev = stream.clone_htod(inv_denoms_ext3)?;
+    let mut out_dev = stream.alloc_zeros::<u64>(3 * num_cols)?;
+
+    let col_stride_u64 = col_stride as u64;
+    let row_stride_u64 = row_stride as u64;
+    let n_u64 = n as u64;
+    let cfg = LaunchConfig {
+        grid_dim: (num_cols as u32, 1, 1),
+        block_dim: (BLOCK_DIM, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    unsafe {
+        stream
+            .launch_builder(&be.barycentric_ext3_batched_strided)
+            .arg(aux_handle.buf.as_ref())
+            .arg(&col_stride_u64)
+            .arg(&row_stride_u64)
+            .arg(&points_dev)
+            .arg(&inv_dev)
+            .arg(&n_u64)
+            .arg(&mut out_dev)
+            .launch(cfg)?;
+    }
+    let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
+    Ok(out)
+}
diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 3af2ed546..353932ba6 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -93,6 +93,7 @@ impl Drop for PinnedStaging {
 const ARITH_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/arith.ptx"));
 const NTT_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/ntt.ptx"));
 const KECCAK_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/keccak.ptx"));
+const BARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/barycentric.ptx"));
 /// Number of CUDA streams in the pool. Larger pools let many rayon-parallel
 /// callers overlap on the GPU without serializing on stream ownership. The
 /// default stream is deliberately excluded because it synchronises with all
@@ -143,6 +144,12 @@ pub struct Backend {
     pub keccak_fri_leaves_ext3: CudaFunction,
     pub keccak_merkle_level: CudaFunction,
 
+    // barycentric.ptx
+    pub barycentric_base_batched: CudaFunction,
+    pub barycentric_ext3_batched: CudaFunction,
+    pub barycentric_base_batched_strided: CudaFunction,
+    pub barycentric_ext3_batched_strided: CudaFunction,
+
     // Twiddle caches keyed by log_n.
     fwd_twiddles: Mutex<Vec<Option<Arc<CudaSlice<u64>>>>>,
     inv_twiddles: Mutex<Vec<Option<Arc<CudaSlice<u64>>>>>,
@@ -160,6 +167,7 @@ impl Backend {
         let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?;
         let ntt = ctx.load_module(Ptx::from_src(NTT_PTX))?;
         let keccak = ctx.load_module(Ptx::from_src(KECCAK_PTX))?;
+        let bary = ctx.load_module(Ptx::from_src(BARY_PTX))?;
 
         let mut streams = Vec::with_capacity(STREAM_POOL_SIZE);
         for _ in 0..STREAM_POOL_SIZE {
@@ -212,6 +220,12 @@ impl Backend {
             keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?,
             keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?,
             keccak_merkle_level: keccak.load_function("keccak_merkle_level")?,
+            barycentric_base_batched: bary.load_function("barycentric_base_batched")?,
+            barycentric_ext3_batched: bary.load_function("barycentric_ext3_batched")?,
+            barycentric_base_batched_strided: bary
+                .load_function("barycentric_base_batched_strided")?,
+            barycentric_ext3_batched_strided: bary
+                .load_function("barycentric_ext3_batched_strided")?,
             fwd_twiddles: Mutex::new(vec![None; max_log]),
             inv_twiddles: Mutex::new(vec![None; max_log]),
             ctx,
diff --git a/crypto/math-cuda/src/lib.rs b/crypto/math-cuda/src/lib.rs
index d1b4e1210..a5bb8defb 100644
--- a/crypto/math-cuda/src/lib.rs
+++ b/crypto/math-cuda/src/lib.rs
@@ -4,6 +4,7 @@
 //! element-wise arith) is either internal to the LDE pipeline or used by the
 //! parity test suite.
 
+pub mod barycentric;
 pub mod device;
 pub mod lde;
 pub mod merkle;
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 6faf12b51..932e81325 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -55,6 +55,7 @@ pub fn keccak_leaves_base(
         &mut out_dev.as_view_mut(),
     )?;
     let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
     Ok(out)
 }
 
@@ -89,6 +90,7 @@ pub fn keccak_leaves_ext3(
         &mut out_dev.as_view_mut(),
     )?;
     let out = stream.clone_dtoh(&out_dev)?;
+    stream.synchronize()?;
     Ok(out)
 }
 
@@ -211,6 +213,7 @@ pub fn build_merkle_tree_on_device(hashed_leaves: &[u8]) -> Result<Vec<u8>> {
     build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, leaves_len)?;
 
     let out = stream.clone_dtoh(&nodes_dev)?;
+    stream.synchronize()?;
     Ok(out)
 }
 
@@ -280,6 +283,7 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res
     build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
 
     let out = stream.clone_dtoh(&nodes_dev)?;
+    stream.synchronize()?;
     drop(staging);
     Ok(out)
 }
@@ -325,6 +329,7 @@ pub fn build_fri_layer_tree_from_evals_ext3(evals: &[u64]) -> Result<Vec<u8>> {
     build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
 
     let out = stream.clone_dtoh(&nodes_dev)?;
+    stream.synchronize()?;
     Ok(out)
 }
 
diff --git a/crypto/math-cuda/tests/barycentric.rs b/crypto/math-cuda/tests/barycentric.rs
new file mode 100644
index 000000000..e9fac4096
--- /dev/null
+++ b/crypto/math-cuda/tests/barycentric.rs
@@ -0,0 +1,144 @@
+//! Parity: GPU barycentric sum vs CPU. We don't call the upstream
+//! `interpolate_coset_eval_*_with_g_n_inv` directly because the GPU kernel
+//! returns only the unscaled sum and the caller applies the ext3 scale. We
+//! replicate the same unscaled sum on CPU for comparison.
+
+use math::field::element::FieldElement;
+use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
+use math::field::goldilocks::GoldilocksField;
+use math::field::traits::IsPrimeField;
+use math_cuda::barycentric::{barycentric_base, barycentric_ext3};
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+type Fp = FieldElement<GoldilocksField>;
+type Fp3 = FieldElement<Degree3GoldilocksExtensionField>;
+
+fn canon_triplet(e: &Fp3) -> [u64; 3] {
+    [
+        GoldilocksField::canonical(e.value()[0].value()),
+        GoldilocksField::canonical(e.value()[1].value()),
+        GoldilocksField::canonical(e.value()[2].value()),
+    ]
+}
+
+fn canon_triplet_raw(t: &[u64]) -> [u64; 3] {
+    [
+        GoldilocksField::canonical(&t[0]),
+        GoldilocksField::canonical(&t[1]),
+        GoldilocksField::canonical(&t[2]),
+    ]
+}
+
+fn random_fp(rng: &mut ChaCha8Rng) -> Fp {
+    Fp::from_raw(rng.r#gen::<u64>())
+}
+fn random_fp3(rng: &mut ChaCha8Rng) -> Fp3 {
+    Fp3::new([random_fp(rng), random_fp(rng), random_fp(rng)])
+}
+
+#[test]
+fn barycentric_base_sum_matches_cpu() {
+    for &(log_n, num_cols) in &[(4u32, 1usize), (8, 5), (10, 17), (12, 3)] {
+        let n = 1 << log_n;
+        let mut rng = ChaCha8Rng::seed_from_u64(100 + log_n as u64 * 7 + num_cols as u64);
+
+        let coset_points: Vec<Fp> = (0..n).map(|_| random_fp(&mut rng)).collect();
+        let inv_denoms: Vec<Fp3> = (0..n).map(|_| random_fp3(&mut rng)).collect();
+
+        // Lay out columns base: column c contiguous slab of n u64s.
+        let cols_fp: Vec<Vec<Fp>> = (0..num_cols)
+            .map(|_| (0..n).map(|_| random_fp(&mut rng)).collect())
+            .collect();
+        let mut columns_flat = vec![0u64; num_cols * n];
+        for (c, col) in cols_fp.iter().enumerate() {
+            for (r, e) in col.iter().enumerate() {
+                columns_flat[c * n + r] = *e.value();
+            }
+        }
+        let points_raw: Vec<u64> = coset_points.iter().map(|e| *e.value()).collect();
+        let inv_denoms_raw: Vec<u64> = inv_denoms
+            .iter()
+            .flat_map(|e| {
+                [
+                    *e.value()[0].value(),
+                    *e.value()[1].value(),
+                    *e.value()[2].value(),
+                ]
+            })
+            .collect();
+
+        let gpu =
+            barycentric_base(&columns_flat, n, &points_raw, &inv_denoms_raw, n, num_cols).unwrap();
+
+        for (c, col) in cols_fp.iter().enumerate() {
+            // CPU reference sum. Force ext3 by embedding the base product.
+            let mut sum = Fp3::zero();
+            for i in 0..n {
+                let pe_base: Fp = &coset_points[i] * &col[i]; // F * F = F
+                // Base * ext3 = ext3 (base is on the left, IsSubFieldOf direction).
+                let pe_ext3: Fp3 = &pe_base * &inv_denoms[i]; // F * E = E
+                sum = &sum + &pe_ext3;
+            }
+            let gpu_sum = canon_triplet_raw(&gpu[c * 3..(c + 1) * 3]);
+            let cpu_sum = canon_triplet(&sum);
+            assert_eq!(
+                gpu_sum, cpu_sum,
+                "base col {c} log_n={log_n} num_cols={num_cols}"
+            );
+        }
+    }
+}
+
+#[test]
+fn barycentric_ext3_sum_matches_cpu() {
+    for &(log_n, num_cols) in &[(4u32, 1usize), (8, 3), (10, 7)] {
+        let n = 1 << log_n;
+        let mut rng = ChaCha8Rng::seed_from_u64(200 + log_n as u64 * 11 + num_cols as u64);
+
+        let coset_points: Vec<Fp> = (0..n).map(|_| random_fp(&mut rng)).collect();
+        let inv_denoms: Vec<Fp3> = (0..n).map(|_| random_fp3(&mut rng)).collect();
+        let cols_fp3: Vec<Vec<Fp3>> = (0..num_cols)
+            .map(|_| (0..n).map(|_| random_fp3(&mut rng)).collect())
+            .collect();
+
+        // De-interleaved layout: 3 base slabs per ext3 column.
+        let mut columns_flat = vec![0u64; num_cols * 3 * n];
+        for (c, col) in cols_fp3.iter().enumerate() {
+            for (r, e) in col.iter().enumerate() {
+                columns_flat[(c * 3) * n + r] = *e.value()[0].value();
+                columns_flat[(c * 3 + 1) * n + r] = *e.value()[1].value();
+                columns_flat[(c * 3 + 2) * n + r] = *e.value()[2].value();
+            }
+        }
+        let points_raw: Vec<u64> = coset_points.iter().map(|e| *e.value()).collect();
+        let inv_denoms_raw: Vec<u64> = inv_denoms
+            .iter()
+            .flat_map(|e| {
+                [
+                    *e.value()[0].value(),
+                    *e.value()[1].value(),
+                    *e.value()[2].value(),
+                ]
+            })
+            .collect();
+
+        let gpu =
+            barycentric_ext3(&columns_flat, n, &points_raw, &inv_denoms_raw, n, num_cols).unwrap();
+
+        for (c, col) in cols_fp3.iter().enumerate() {
+            let mut sum = Fp3::zero();
+            for i in 0..n {
+                let pe: Fp3 = &coset_points[i] * &col[i]; // F * E = E
+                let term: Fp3 = &pe * &inv_denoms[i]; // E * E = E
+                sum = &sum + &term;
+            }
+            let gpu_sum = canon_triplet_raw(&gpu[c * 3..(c + 1) * 3]);
+            let cpu_sum = canon_triplet(&sum);
+            assert_eq!(
+                gpu_sum, cpu_sum,
+                "ext3 col {c} log_n={log_n} num_cols={num_cols}"
+            );
+        }
+    }
+}
diff --git a/crypto/math-cuda/tests/barycentric_strided.rs b/crypto/math-cuda/tests/barycentric_strided.rs
new file mode 100644
index 000000000..653ef4e38
--- /dev/null
+++ b/crypto/math-cuda/tests/barycentric_strided.rs
@@ -0,0 +1,147 @@
+//! Parity: strided barycentric kernels (used by R3 OOD on device LDE handles)
+//! match the non-strided kernels fed a pre-strided column buffer.
+
+use std::sync::Arc;
+
+use math::field::element::FieldElement;
+use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
+use math::field::goldilocks::GoldilocksField;
+use math_cuda::barycentric::{
+    barycentric_base, barycentric_base_on_device, barycentric_ext3, barycentric_ext3_on_device,
+};
+use math_cuda::device::backend;
+use math_cuda::lde::{GpuLdeBase, GpuLdeExt3};
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+type Fp = FieldElement<GoldilocksField>;
+type Fp3 = FieldElement<Degree3GoldilocksExtensionField>;
+
+fn rand_fp(rng: &mut ChaCha8Rng) -> Fp {
+    Fp::from_raw(rng.r#gen::<u64>())
+}
+fn rand_fp3(rng: &mut ChaCha8Rng) -> Fp3 {
+    Fp3::new([rand_fp(rng), rand_fp(rng), rand_fp(rng)])
+}
+
+fn run_base(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) {
+    let n = 1usize << log_trace;
+    let lde_size = n * blowup;
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    let lde_data: Vec<Vec<Fp>> = (0..num_cols)
+        .map(|_| (0..lde_size).map(|_| rand_fp(&mut rng)).collect())
+        .collect();
+    let coset_points: Vec<u64> = (0..n).map(|_| rng.r#gen::<u64>()).collect();
+    let inv_denoms_ext3: Vec<u64> = (0..(n * 3)).map(|_| rng.r#gen::<u64>()).collect();
+
+    // Pack full LDE column-major for device.
+    let mut lde_flat = vec![0u64; num_cols * lde_size];
+    for (c, col) in lde_data.iter().enumerate() {
+        for (r, v) in col.iter().enumerate() {
+            lde_flat[c * lde_size + r] = *v.value();
+        }
+    }
+    let be = backend().unwrap();
+    let stream = be.next_stream();
+    let lde_dev = stream.clone_htod(&lde_flat).unwrap();
+    stream.synchronize().unwrap();
+    let handle = GpuLdeBase {
+        buf: Arc::new(lde_dev),
+        m: num_cols,
+        lde_size,
+    };
+
+    // Pre-strided buffer for non-strided reference: trace-size picks of each col.
+    let mut pre_strided = vec![0u64; num_cols * n];
+    for c in 0..num_cols {
+        for i in 0..n {
+            pre_strided[c * n + i] = lde_flat[c * lde_size + i * blowup];
+        }
+    }
+
+    let reference = barycentric_base(
+        &pre_strided,
+        n,
+        &coset_points,
+        &inv_denoms_ext3,
+        n,
+        num_cols,
+    )
+    .unwrap();
+
+    let strided =
+        barycentric_base_on_device(&handle, blowup, &coset_points, &inv_denoms_ext3, n).unwrap();
+
+    assert_eq!(
+        reference, strided,
+        "base strided mismatch (log_trace={log_trace}, blowup={blowup}, cols={num_cols})"
+    );
+}
+
+fn run_ext3(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) {
+    let n = 1usize << log_trace;
+    let lde_size = n * blowup;
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    let lde_data: Vec<Vec<Fp3>> = (0..num_cols)
+        .map(|_| (0..lde_size).map(|_| rand_fp3(&mut rng)).collect())
+        .collect();
+    let coset_points: Vec<u64> = (0..n).map(|_| rng.r#gen::<u64>()).collect();
+    let inv_denoms_ext3: Vec<u64> = (0..(n * 3)).map(|_| rng.r#gen::<u64>()).collect();
+
+    // Pack LDE de-interleaved: (m*3) by lde_size.
+    let mut lde_flat = vec![0u64; num_cols * 3 * lde_size];
+    for (c, col) in lde_data.iter().enumerate() {
+        for (r, v) in col.iter().enumerate() {
+            lde_flat[(c * 3) * lde_size + r] = *v.value()[0].value();
+            lde_flat[(c * 3 + 1) * lde_size + r] = *v.value()[1].value();
+            lde_flat[(c * 3 + 2) * lde_size + r] = *v.value()[2].value();
+        }
+    }
+    let be = backend().unwrap();
+    let stream = be.next_stream();
+    let lde_dev = stream.clone_htod(&lde_flat).unwrap();
+    stream.synchronize().unwrap();
+    let handle = GpuLdeExt3 {
+        buf: Arc::new(lde_dev),
+        m: num_cols,
+        lde_size,
+    };
+
+    // Pre-strided buffer for non-strided reference.
+    let mut pre_strided = vec![0u64; num_cols * 3 * n];
+    for c in 0..num_cols {
+        for i in 0..n {
+            pre_strided[(c * 3) * n + i] = lde_flat[(c * 3) * lde_size + i * blowup];
+            pre_strided[(c * 3 + 1) * n + i] = lde_flat[(c * 3 + 1) * lde_size + i * blowup];
+            pre_strided[(c * 3 + 2) * n + i] = lde_flat[(c * 3 + 2) * lde_size + i * blowup];
+        }
+    }
+    let reference = barycentric_ext3(
+        &pre_strided,
+        n,
+        &coset_points,
+        &inv_denoms_ext3,
+        n,
+        num_cols,
+    )
+    .unwrap();
+
+    let strided =
+        barycentric_ext3_on_device(&handle, blowup, &coset_points, &inv_denoms_ext3, n).unwrap();
+
+    assert_eq!(reference, strided, "ext3 strided mismatch");
+}
+
+#[test]
+fn bary_base_strided_small() {
+    for (log_t, blowup, cols) in [(4u32, 2usize, 3usize), (8, 4, 10), (12, 2, 5)] {
+        run_base(log_t, blowup, cols, 1000 + log_t as u64);
+    }
+}
+
+#[test]
+fn bary_ext3_strided_small() {
+    for (log_t, blowup, cols) in [(4u32, 2usize, 2usize), (8, 4, 5), (10, 2, 3)] {
+        run_ext3(log_t, blowup, cols, 2000 + log_t as u64);
+    }
+}
diff --git a/crypto/math-cuda/tests/comp_poly_tree.rs b/crypto/math-cuda/tests/comp_poly_tree.rs
new file mode 100644
index 000000000..29e33b6fe
--- /dev/null
+++ b/crypto/math-cuda/tests/comp_poly_tree.rs
@@ -0,0 +1,235 @@
+//! Parity: GPU fused `evaluate_poly_coset_batch_ext3_into_with_merkle_tree`
+//! (LDE + row-pair Keccak leaves + Merkle inner tree) against the same CPU
+//! pipeline produced by `commit_composition_polynomial`.
+
+use math::field::element::FieldElement;
+use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
+use math::field::goldilocks::GoldilocksField;
+use math::field::traits::{IsField, IsPrimeField};
+use math::polynomial::Polynomial;
+use math::traits::ByteConversion;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use sha3::{Digest, Keccak256};
+
+type Fp = FieldElement<GoldilocksField>;
+type Fp3 = FieldElement<Degree3GoldilocksExtensionField>;
+
+fn reverse_index(i: u64, n: u64) -> u64 {
+    let log_n = n.trailing_zeros();
+    i.reverse_bits() >> (64 - log_n)
+}
+
+fn offset_weights(n: usize, offset: u64) -> Vec<u64> {
+    let mut w = Vec::with_capacity(n);
+    let mut cur = 1u64;
+    for _ in 0..n {
+        w.push(cur);
+        cur = GoldilocksField::mul(&cur, &offset);
+    }
+    w
+}
+
+fn rand_ext3(rng: &mut ChaCha8Rng) -> Fp3 {
+    Fp3::new([
+        Fp::from_raw(rng.r#gen::<u64>()),
+        Fp::from_raw(rng.r#gen::<u64>()),
+        Fp::from_raw(rng.r#gen::<u64>()),
+    ])
+}
+
+fn ext3_to_u64s(col: &[Fp3]) -> Vec<u64> {
+    let mut out = Vec::with_capacity(col.len() * 3);
+    for e in col {
+        out.push(*e.value()[0].value());
+        out.push(*e.value()[1].value());
+        out.push(*e.value()[2].value());
+    }
+    out
+}
+
+fn u64s_to_ext3(raw: &[u64]) -> Vec<Fp3> {
+    let mut out = Vec::with_capacity(raw.len() / 3);
+    for i in 0..raw.len() / 3 {
+        out.push(Fp3::new([
+            Fp::from_raw(raw[i * 3]),
+            Fp::from_raw(raw[i * 3 + 1]),
+            Fp::from_raw(raw[i * 3 + 2]),
+        ]));
+    }
+    out
+}
+
+fn canon_ext3(e: &Fp3) -> [u64; 3] {
+    [
+        GoldilocksField::canonical(e.value()[0].value()),
+        GoldilocksField::canonical(e.value()[1].value()),
+        GoldilocksField::canonical(e.value()[2].value()),
+    ]
+}
+
+/// CPU: evaluate polynomial on coset via `Polynomial::evaluate_offset_fft`.
+fn cpu_evaluate(coefs: &[Fp3], blowup: usize, offset: &Fp) -> Vec<Fp3> {
+    let poly = Polynomial::new(coefs);
+    Polynomial::evaluate_offset_fft::<GoldilocksField>(&poly, blowup, None, offset).unwrap()
+}
+
+fn cpu_hash_pair(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] {
+    let mut h = Keccak256::new();
+    h.update(left);
+    h.update(right);
+    let mut out = [0u8; 32];
+    out.copy_from_slice(&h.finalize());
+    out
+}
+
+/// CPU: `commit_composition_polynomial`-style tree root over num_rows/2 leaves.
+fn cpu_tree_nodes(parts: &[Vec<Fp3>]) -> Vec<[u8; 32]> {
+    let num_rows = parts[0].len();
+    let num_parts = parts.len();
+    let num_leaves = num_rows / 2;
+    assert!(num_leaves.is_power_of_two() && num_leaves >= 1);
+    let byte_len = 24;
+
+    let hashed_leaves: Vec<[u8; 32]> = (0..num_leaves)
+        .map(|leaf_idx| {
+            let br_0 = reverse_index(2 * leaf_idx as u64, num_rows as u64) as usize;
+            let br_1 = reverse_index(2 * leaf_idx as u64 + 1, num_rows as u64) as usize;
+            let total_bytes = 2 * num_parts * byte_len;
+            let mut buf = vec![0u8; total_bytes];
+            let mut offset = 0;
+            for part in parts.iter() {
+                part[br_0].write_bytes_be(&mut buf[offset..offset + byte_len]);
+                offset += byte_len;
+            }
+            for part in parts.iter() {
+                part[br_1].write_bytes_be(&mut buf[offset..offset + byte_len]);
+                offset += byte_len;
+            }
+            let mut h = Keccak256::new();
+            h.update(&buf);
+            let mut r = [0u8; 32];
+            r.copy_from_slice(&h.finalize());
+            r
+        })
+        .collect();
+
+    let total = 2 * num_leaves - 1;
+    let mut nodes: Vec<[u8; 32]> = vec![[0u8; 32]; total];
+    for (i, leaf) in hashed_leaves.iter().enumerate() {
+        nodes[num_leaves - 1 + i] = *leaf;
+    }
+    let mut level_begin = num_leaves - 1;
+    while level_begin != 0 {
+        let new_begin = level_begin / 2;
+        let n_pairs = level_begin - new_begin;
+        for j in 0..n_pairs {
+            let left = nodes[level_begin + 2 * j];
+            let right = nodes[level_begin + 2 * j + 1];
+            nodes[new_begin + j] = cpu_hash_pair(&left, &right);
+        }
+        level_begin = new_begin;
+    }
+    nodes
+}
+
+fn run_parity(log_n: u32, blowup: usize, num_parts: usize, seed: u64) {
+    let n = 1usize << log_n;
+    let lde_size = n * blowup;
+    assert!(lde_size >= 2);
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+
+    // Random ext3 coefficient vectors per part.
+    let parts_cpu: Vec<Vec<Fp3>> = (0..num_parts)
+        .map(|_| (0..n).map(|_| rand_ext3(&mut rng)).collect())
+        .collect();
+
+    // CPU LDE via evaluate_offset_fft, then CPU tree.
+    let offset_u64 = rng.r#gen::<u64>() | 1;
+    let offset = Fp::from_raw(offset_u64);
+    let cpu_lde_parts: Vec<Vec<Fp3>> = parts_cpu
+        .iter()
+        .map(|c| cpu_evaluate(c, blowup, &offset))
+        .collect();
+    let cpu_nodes = cpu_tree_nodes(&cpu_lde_parts);
+
+    // GPU fused call.
+    let weights = offset_weights(n, offset_u64);
+    let coefs_u64: Vec<Vec<u64>> = parts_cpu.iter().map(|c| ext3_to_u64s(c)).collect();
+    let coefs_slices: Vec<&[u64]> = coefs_u64.iter().map(|v| v.as_slice()).collect();
+    let mut outputs_raw: Vec<Vec<u64>> = (0..num_parts).map(|_| vec![0u64; 3 * lde_size]).collect();
+    let mut outputs_slices: Vec<&mut [u64]> =
+        outputs_raw.iter_mut().map(|v| v.as_mut_slice()).collect();
+    // R2 leaves are row-pairs: num_leaves = lde_size / 2, so
+    // tight_total_nodes = 2 * num_leaves - 1 = lde_size - 1.
+    let total_nodes = lde_size - 1;
+    let mut nodes_bytes = vec![0u8; total_nodes * 32];
+
+    math_cuda::lde::evaluate_poly_coset_batch_ext3_into_with_merkle_tree(
+        &coefs_slices,
+        n,
+        blowup,
+        &weights,
+        &mut outputs_slices,
+        &mut nodes_bytes,
+    )
+    .unwrap();
+
+    // Compare LDE parts.
+    for (c, cpu_col) in cpu_lde_parts.iter().enumerate() {
+        let gpu_col = u64s_to_ext3(&outputs_raw[c]);
+        for i in 0..lde_size {
+            assert_eq!(
+                canon_ext3(&gpu_col[i]),
+                canon_ext3(&cpu_col[i]),
+                "LDE mismatch part {c} row {i} log_n={log_n} blowup={blowup}"
+            );
+        }
+    }
+
+    // Compare tree nodes. GPU writes `2*num_leaves - 1 = lde_size - 1` nodes.
+    let num_leaves = lde_size / 2;
+    let tight_total = 2 * num_leaves - 1;
+    assert_eq!(cpu_nodes.len(), tight_total);
+    for i in 0..tight_total {
+        let g = &nodes_bytes[i * 32..(i + 1) * 32];
+        let c = &cpu_nodes[i];
+        assert_eq!(
+            g, c,
+            "tree node {i} mismatch at log_n={log_n} blowup={blowup} parts={num_parts}"
+        );
+    }
+}
+
+#[test]
+fn comp_poly_tree_small() {
+    for log_n in 2u32..=5 {
+        for &blowup in &[2usize, 4, 8] {
+            for &parts in &[1usize, 2, 4] {
+                run_parity(
+                    log_n,
+                    blowup,
+                    parts,
+                    1000 + log_n as u64 * 31 + parts as u64,
+                );
+            }
+        }
+    }
+}
+
+#[test]
+fn comp_poly_tree_medium() {
+    for &(log_n, blowup, parts) in &[(10u32, 4usize, 4usize), (12, 2, 3)] {
+        run_parity(
+            log_n,
+            blowup,
+            parts,
+            2000 + log_n as u64 * 11 + parts as u64,
+        );
+    }
+}
+
+#[test]
+fn comp_poly_tree_large() {
+    run_parity(14, 2, 4, 9999);
+}
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 6c0d8d9e9..530e2f6b9 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -5,17 +5,21 @@
 //! launch overhead dominates. Produces the same natural-order, non-canonical
 //! LDE evaluations as the CPU path.
 
+use core::mem::transmute_copy;
 use std::any::TypeId;
 use std::slice::{from_raw_parts, from_raw_parts_mut};
 use std::sync::OnceLock;
 use std::sync::atomic::{AtomicU64, Ordering};
 
+use crypto::merkle_tree::merkle::MerkleTree;
+use crypto::merkle_tree::traits::IsMerkleTreeBackend;
 use math::field::element::FieldElement;
 use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
 use math::field::goldilocks::GoldilocksField;
-use math::field::traits::{IsField, IsSubFieldOf};
+use math::field::traits::{IsFFTField, IsField, IsSubFieldOf};
 
 use crate::domain::Domain;
+use crate::trace::LDETraceTable;
 
 /// Break-even LDE size. For LDE sizes smaller than this, the CPU
 /// `coset_lde_full_expand` completes in a few hundred microseconds and the
@@ -56,6 +60,9 @@ pub fn reset_all_gpu_call_counters() {
     GPU_EXTEND_HALVES_CALLS.store(0, Ordering::Relaxed);
     GPU_LEAF_HASH_CALLS.store(0, Ordering::Relaxed);
     GPU_MERKLE_TREE_CALLS.store(0, Ordering::Relaxed);
+    GPU_PARTS_LDE_CALLS.store(0, Ordering::Relaxed);
+    GPU_BARY_CALLS.store(0, Ordering::Relaxed);
+    GPU_COMP_POLY_TREE_CALLS.store(0, Ordering::Relaxed);
 }
 
 pub(crate) static GPU_EXTEND_HALVES_CALLS: AtomicU64 = AtomicU64::new(0);
@@ -267,7 +274,10 @@ fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> {
     // SAFETY: every byte will be overwritten via the GPU D2H before the
     // contents are read. The caller computes the byte-length view from the
     // returned `nodes` Vec using `total_nodes.checked_mul(32)`.
-    unsafe { nodes.set_len(total_nodes) };
+    #[allow(clippy::uninit_vec)]
+    unsafe {
+        nodes.set_len(total_nodes)
+    };
     Some((nodes, total_nodes))
 }
 
@@ -342,13 +352,14 @@ where
 ///
 /// Returns `None` when the GPU path doesn't apply (too small, or CPU path
 /// should be used); in that case the caller runs its existing rayon::join.
+#[allow(clippy::type_complexity)]
 pub(crate) fn try_extend_two_halves_gpu<F, E>(
     h0: &[FieldElement<E>],
     h1: &[FieldElement<E>],
     domain: &Domain<F>,
 ) -> Option<(Vec<FieldElement<E>>, Vec<FieldElement<E>>)>
 where
-    F: math::field::traits::IsFFTField + IsField + 'static,
+    F: IsFFTField + IsField + 'static,
     E: IsField + 'static,
     F: IsSubFieldOf<E>,
 {
@@ -391,7 +402,7 @@ where
         // F == GoldilocksField by TypeId check above, so value is u64.
         let v: u64 = unsafe { *(w.value() as *const _ as *const u64) };
         weights_u64.push(v);
-        w = w * &g_inv;
+        w *= &g_inv;
     }
 
     // Pre-allocate outputs.
@@ -438,14 +449,11 @@ pub(crate) fn try_expand_leaf_and_tree_batched_keep<F, E, B>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
     weights: &[FieldElement<F>],
-) -> Option<(
-    crypto::merkle_tree::merkle::MerkleTree<B>,
-    math_cuda::lde::GpuLdeBase,
-)>
+) -> Option<(MerkleTree<B>, math_cuda::lde::GpuLdeBase)>
 where
     F: IsField + 'static,
     E: IsField + 'static,
-    B: crypto::merkle_tree::traits::IsMerkleTreeBackend<Node = [u8; 32]>,
+    B: IsMerkleTreeBackend<Node = [u8; 32]>,
 {
     let (n, lde_size) = match check_base_layout::<F, E>(columns, blowup_factor) {
         LayoutDispatch::Empty | LayoutDispatch::Skip => return None,
@@ -486,7 +494,7 @@ where
         }
     };
 
-    let tree = crypto::merkle_tree::merkle::MerkleTree::<B>::from_precomputed_nodes(nodes)?;
+    let tree = MerkleTree::<B>::from_precomputed_nodes(nodes)?;
     Some((tree, handle))
 }
 
@@ -498,14 +506,11 @@ pub(crate) fn try_expand_leaf_and_tree_batched_ext3_keep<F, E, B>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
     weights: &[FieldElement<F>],
-) -> Option<(
-    crypto::merkle_tree::merkle::MerkleTree<B>,
-    math_cuda::lde::GpuLdeExt3,
-)>
+) -> Option<(MerkleTree<B>, math_cuda::lde::GpuLdeExt3)>
 where
     F: IsField + 'static,
     E: IsField + 'static,
-    B: crypto::merkle_tree::traits::IsMerkleTreeBackend<Node = [u8; 32]>,
+    B: IsMerkleTreeBackend<Node = [u8; 32]>,
 {
     let (n, lde_size) = match check_ext3_layout::<F, E>(columns, blowup_factor) {
         LayoutDispatch::Empty | LayoutDispatch::Skip => return None,
@@ -547,7 +552,7 @@ where
         }
     };
 
-    let tree = crypto::merkle_tree::merkle::MerkleTree::<B>::from_precomputed_nodes(nodes)?;
+    let tree = MerkleTree::<B>::from_precomputed_nodes(nodes)?;
     Some((tree, handle))
 }
 
@@ -603,3 +608,398 @@ static GPU_MERKLE_TREE_CALLS: AtomicU64 = AtomicU64::new(0);
 pub fn gpu_merkle_tree_calls() -> u64 {
     GPU_MERKLE_TREE_CALLS.load(Ordering::Relaxed)
 }
+
+// ============================================================================
+// PR-3: R2 composition-parts LDE + Merkle commit + R3 OOD barycentric
+// ============================================================================
+
+/// R2 dispatch counter: incremented once per
+/// [`try_evaluate_parts_on_lde_gpu`] call that actually routed to the GPU.
+pub(crate) static GPU_PARTS_LDE_CALLS: AtomicU64 = AtomicU64::new(0);
+pub fn gpu_parts_lde_calls() -> u64 {
+    GPU_PARTS_LDE_CALLS.load(Ordering::Relaxed)
+}
+
+/// R3 dispatch counter: incremented once per `try_barycentric_*_on_handle`
+/// call that actually routed to the GPU.
+pub(crate) static GPU_BARY_CALLS: AtomicU64 = AtomicU64::new(0);
+pub fn gpu_bary_calls() -> u64 {
+    GPU_BARY_CALLS.load(Ordering::Relaxed)
+}
+
+/// R2 comp-poly tree counter: incremented once per
+/// [`try_build_comp_poly_tree_gpu`] call that actually routed to the GPU.
+/// Distinct from `GPU_MERKLE_TREE_CALLS` (R1 main/aux tree builds) so the
+/// two dispatch sites can be diagnosed independently.
+pub(crate) static GPU_COMP_POLY_TREE_CALLS: AtomicU64 = AtomicU64::new(0);
+pub fn gpu_comp_poly_tree_calls() -> u64 {
+    GPU_COMP_POLY_TREE_CALLS.load(Ordering::Relaxed)
+}
+
+/// Trace-size threshold for the R3 OOD barycentric GPU path. Below this the
+/// rayon CPU path is competitive and PCIe round-trip overhead would dominate.
+/// Override via `LAMBDA_VM_GPU_BARY_THRESHOLD`.
+const DEFAULT_GPU_BARY_THRESHOLD: usize = 1 << 14;
+
+fn gpu_bary_threshold() -> usize {
+    static CACHED: OnceLock<usize> = OnceLock::new();
+    *CACHED.get_or_init(|| {
+        std::env::var("LAMBDA_VM_GPU_BARY_THRESHOLD")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(DEFAULT_GPU_BARY_THRESHOLD)
+    })
+}
+
+/// One ext3 scalar `(n_inv * g_n_inv) * (z^N - g^N)`. Returned as `[u64;3]`.
+///
+/// The GPU kernels compute only the unscaled barycentric sum per column.
+/// Applying this scalar on the host is one ext3 multiply per column, cheap
+/// next to the trace-size reduction.
+fn ood_ext3_scalar<F, E>(
+    coset_offset_pow_n: &FieldElement<F>,
+    n_inv: &FieldElement<F>,
+    g_n_inv: &FieldElement<F>,
+    z_pow_n: &FieldElement<E>,
+) -> [u64; 3]
+where
+    F: IsField + IsSubFieldOf<E> + 'static,
+    E: IsField + 'static,
+{
+    // (z^N - g^N) in E, via sub_subfield (E - F -> E).
+    let vanishing = z_pow_n.sub_subfield(coset_offset_pow_n);
+    let base_scalar = n_inv * g_n_inv; // F * F -> F
+    let scalar_ext3: FieldElement<E> = &base_scalar * &vanishing; // F * E -> E
+    // SAFETY: TypeId-checked at the caller (`try_barycentric_*_on_handle`).
+    // E == Degree3GoldilocksExtensionField, whose backing is
+    // `[FieldElement<Gl>; 3]`, layout-equivalent to `[u64; 3]`.
+    let ptr = &scalar_ext3 as *const FieldElement<E> as *const u64;
+    unsafe { [*ptr, *ptr.add(1), *ptr.add(2)] }
+}
+
+/// Multiply each raw GPU ext3 sum by the host-computed ext3 scalar. `sums_raw`
+/// is `3 * num_cols` u64s (interleaved). Returns the final OOD evaluations as
+/// `Vec<FieldElement<E>>` of length `num_cols`.
+fn apply_ext3_scalar<E>(sums_raw: &[u64], scalar: [u64; 3], num_cols: usize) -> Vec<FieldElement<E>>
+where
+    E: IsField + 'static,
+{
+    type Gl = GoldilocksField;
+    type Ext3 = Degree3GoldilocksExtensionField;
+
+    assert_eq!(sums_raw.len(), 3 * num_cols);
+    // Avoids the `E != Ext3` path reaching the unsafe `transmute_copy` below
+    // that is UB in that case. Cost is one TypeId comparison per call.
+    assert_eq!(
+        TypeId::of::<E>(),
+        TypeId::of::<Ext3>(),
+        "apply_ext3_scalar: E must be Ext3"
+    );
+
+    let scalar_e: FieldElement<Ext3> = FieldElement::<Ext3>::new([
+        FieldElement::<Gl>::from_raw(scalar[0]),
+        FieldElement::<Gl>::from_raw(scalar[1]),
+        FieldElement::<Gl>::from_raw(scalar[2]),
+    ]);
+
+    let mut out: Vec<FieldElement<E>> = Vec::with_capacity(num_cols);
+    for c in 0..num_cols {
+        let s: FieldElement<Ext3> = FieldElement::<Ext3>::new([
+            FieldElement::<Gl>::from_raw(sums_raw[c * 3]),
+            FieldElement::<Gl>::from_raw(sums_raw[c * 3 + 1]),
+            FieldElement::<Gl>::from_raw(sums_raw[c * 3 + 2]),
+        ]);
+        let final_ext3 = &s * &scalar_e;
+        // SAFETY: TypeId-checked at the caller. E == Ext3, identical layout.
+        let final_e: FieldElement<E> =
+            unsafe { transmute_copy::<FieldElement<Ext3>, FieldElement<E>>(&final_ext3) };
+        out.push(final_e);
+    }
+    out
+}
+
+/// R2 GPU dispatch: batched ext3 LDE over `parts_coefs` (composition-poly
+/// coefficient parts). Returns the LDE evaluations as `Vec<Vec<FieldElement<E>>>`
+/// of length `lde_size` per part on success, `None` to fall through to the CPU
+/// path. Used by `round_2_compute_composition_polynomial` in the
+/// `number_of_parts > 2` branch.
+///
+/// Inputs are immutable (`&[&[FieldElement<E>]]`) and outputs are fresh, so
+/// there is no `restore_columns_on_err` needed. `Err` just returns `None`
+/// and the caller's coefficient slices are left untouched.
+pub(crate) fn try_evaluate_parts_on_lde_gpu<F, E>(
+    parts_coefs: &[&[FieldElement<E>]],
+    blowup_factor: usize,
+    domain_size: usize,
+    offset: &FieldElement<F>,
+) -> Option<Vec<Vec<FieldElement<E>>>>
+where
+    F: IsFFTField + IsField + IsSubFieldOf<E> + 'static,
+    E: IsField + 'static,
+{
+    if parts_coefs.is_empty() {
+        return Some(Vec::new());
+    }
+    if !domain_size.is_power_of_two() || !blowup_factor.is_power_of_two() {
+        return None;
+    }
+    let lde_size = domain_size.checked_mul(blowup_factor)?;
+    if lde_size < gpu_lde_threshold() {
+        return None;
+    }
+    if TypeId::of::<E>() != TypeId::of::<Degree3GoldilocksExtensionField>() {
+        return None;
+    }
+    if TypeId::of::<F>() != TypeId::of::<GoldilocksField>() {
+        return None;
+    }
+    let m = parts_coefs.len();
+
+    // Weights: `offset^k` for k in 0..domain_size. F is Goldilocks by check above.
+    let mut weights_u64 = Vec::with_capacity(domain_size);
+    let mut w = FieldElement::<F>::one();
+    for _ in 0..domain_size {
+        // SAFETY: F == Goldilocks per TypeId check. FieldElement<Gl> is
+        // #[repr(transparent)] over u64.
+        let v: u64 = unsafe { *(w.value() as *const _ as *const u64) };
+        weights_u64.push(v);
+        w *= offset;
+    }
+
+    // Pack parts into per-part `3 * domain_size` u64 buffers (zero-padded).
+    let mut part_bufs: Vec<Vec<u64>> = Vec::with_capacity(m);
+    for part in parts_coefs.iter() {
+        let mut buf = vec![0u64; 3 * domain_size];
+        let len = part.len().min(domain_size);
+        // SAFETY: E == Ext3; backing is `[FieldElement<Gl>; 3]` = `[u64; 3]`.
+        let src_ptr = part.as_ptr() as *const u64;
+        let src_len = len.checked_mul(3).expect("part src len overflow");
+        let src = unsafe { from_raw_parts(src_ptr, src_len) };
+        buf[..src_len].copy_from_slice(src);
+        part_bufs.push(buf);
+    }
+    let input_slices: Vec<&[u64]> = part_bufs.iter().map(|v| v.as_slice()).collect();
+
+    let mut outputs: Vec<Vec<FieldElement<E>>> = (0..m)
+        .map(|_| vec![FieldElement::<E>::zero(); lde_size])
+        .collect();
+    let gpu_result = {
+        let mut out_slices: Vec<&mut [u64]> = outputs
+            .iter_mut()
+            .map(|o| {
+                let ptr = o.as_mut_ptr() as *mut u64;
+                let byte_len = lde_size.checked_mul(3).expect("ext3 out len overflow");
+                // SAFETY: E == Ext3 per TypeId check; Vec<FieldElement<E>> of
+                // length `lde_size` is layout-equivalent to `[u64; 3 * lde_size]`.
+                unsafe { from_raw_parts_mut(ptr, byte_len) }
+            })
+            .collect();
+        math_cuda::lde::evaluate_poly_coset_batch_ext3_into(
+            &input_slices,
+            domain_size,
+            blowup_factor,
+            &weights_u64,
+            &mut out_slices,
+        )
+    };
+    if gpu_result.is_err() {
+        // Outputs are local and dropped on return; caller's inputs are
+        // immutable, so no restore is needed.
+        return None;
+    }
+    GPU_PARTS_LDE_CALLS.fetch_add(1, Ordering::Relaxed);
+    Some(outputs)
+}
+
+/// R2 GPU dispatch: build the composition-polynomial Merkle tree from the
+/// host-side ext3 LDE eval Vecs produced by [`try_evaluate_parts_on_lde_gpu`]
+/// (or the CPU path). Uses the same row-pair leaf pattern as the CPU
+/// `commit_composition_polynomial`: each leaf hashes 2 consecutive
+/// bit-reversed rows.
+///
+/// Returns `None` to fall through to the CPU path when the type or size
+/// conditions don't hold; returns `None` on a math-cuda `Err` so the caller
+/// recomputes on CPU.
+pub(crate) fn try_build_comp_poly_tree_gpu<E, B>(
+    lde_parts: &[Vec<FieldElement<E>>],
+) -> Option<MerkleTree<B>>
+where
+    E: IsField + 'static,
+    B: IsMerkleTreeBackend<Node = [u8; 32]>,
+{
+    if lde_parts.is_empty() {
+        return None;
+    }
+    let lde_size = lde_parts[0].len();
+    if !lde_size.is_power_of_two() || lde_size < 2 {
+        return None;
+    }
+    if lde_size < gpu_lde_threshold() {
+        return None;
+    }
+    if TypeId::of::<E>() != TypeId::of::<Degree3GoldilocksExtensionField>() {
+        return None;
+    }
+    // All parts must have the same LDE length.
+    if lde_parts.iter().any(|p| p.len() != lde_size) {
+        return None;
+    }
+
+    // SAFETY: E == Ext3 per TypeId check. FieldElement<Ext3> backing is
+    // `[FieldElement<Gl>; 3]`, layout-equivalent to `[u64; 3]`.
+    let raw_parts: Vec<&[u64]> = lde_parts
+        .iter()
+        .map(|p| {
+            let byte_len = p.len().checked_mul(3).expect("ext3 part byte len overflow");
+            unsafe { from_raw_parts(p.as_ptr() as *const u64, byte_len) }
+        })
+        .collect();
+
+    let nodes_bytes = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3(&raw_parts) {
+        Ok(v) => v,
+        Err(_) => return None,
+    };
+
+    // lde_size is an even power of two >= 2, so 2*num_leaves == lde_size and
+    // tight_total_nodes = lde_size - 1 >= 1. No overflow or underflow possible.
+    let tight_total_nodes = lde_size - 1;
+    let expected_byte_len = tight_total_nodes
+        .checked_mul(32)
+        .expect("comp-poly node byte length overflow");
+    debug_assert_eq!(nodes_bytes.len(), expected_byte_len);
+
+    let nodes: Vec<[u8; 32]> = nodes_bytes
+        .chunks_exact(32)
+        .map(|c| {
+            c.try_into()
+                .expect("chunks_exact(32) yields exactly 32 bytes")
+        })
+        .collect();
+    GPU_COMP_POLY_TREE_CALLS.fetch_add(1, Ordering::Relaxed);
+    // Falls back to CPU on `None`, matching the R1 paths (lines 496, 557).
+    MerkleTree::<B>::from_precomputed_nodes(nodes)
+}
+
+/// R3 GPU dispatch: batched strided barycentric OOD evaluation over the main
+/// (base-field) LDE columns kept on device from R1. Operates on the
+/// device-resident LDE in place; only the coset points and inv_denoms are
+/// copied to the device, not the columns. Returns the OOD evaluations
+/// as `Vec<FieldElement<E>>` of length `num_main_cols` (already scaled by
+/// `vanishing * n_inv * g_n_inv`), or `None` if the GPU handle is absent,
+/// types don't match, the trace-size domain is below threshold, or the
+/// math-cuda call returns `Err`.
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn try_barycentric_base_on_handle<F, E>(
+    lde_trace: &LDETraceTable<F, E>,
+    row_stride: usize,
+    coset_points: &[FieldElement<F>],
+    coset_offset_pow_n: &FieldElement<F>,
+    n_inv: &FieldElement<F>,
+    g_n_inv: &FieldElement<F>,
+    z_pow_n: &FieldElement<E>,
+    inv_denoms: &[FieldElement<E>],
+) -> Option<Vec<FieldElement<E>>>
+where
+    F: IsField + IsSubFieldOf<E> + 'static,
+    E: IsField + 'static,
+{
+    if TypeId::of::<F>() != TypeId::of::<GoldilocksField>() {
+        return None;
+    }
+    if TypeId::of::<E>() != TypeId::of::<Degree3GoldilocksExtensionField>() {
+        return None;
+    }
+    let main = lde_trace.gpu_main()?;
+    let num_cols = main.m;
+    if num_cols == 0 {
+        return Some(Vec::new());
+    }
+    let n = coset_points.len();
+    if !n.is_power_of_two() || n < gpu_bary_threshold() {
+        return None;
+    }
+    if inv_denoms.len() != n || main.lde_size != n.checked_mul(row_stride)? {
+        return None;
+    }
+
+    // SAFETY: F == Goldilocks per TypeId check; FieldElement<Gl> is
+    // #[repr(transparent)] over u64.
+    let points_raw: &[u64] = unsafe { from_raw_parts(coset_points.as_ptr() as *const u64, n) };
+    // SAFETY: E == Ext3 per TypeId check; FieldElement<Ext3> backing is
+    // `[FieldElement<Gl>; 3]` = `[u64; 3]`.
+    let inv_denoms_len = n.checked_mul(3).expect("inv_denoms u64 len overflow");
+    let inv_denoms_raw: &[u64] =
+        unsafe { from_raw_parts(inv_denoms.as_ptr() as *const u64, inv_denoms_len) };
+
+    let sums_raw = match math_cuda::barycentric::barycentric_base_on_device(
+        main,
+        row_stride,
+        points_raw,
+        inv_denoms_raw,
+        n,
+    ) {
+        Ok(v) => v,
+        Err(_) => return None,
+    };
+    GPU_BARY_CALLS.fetch_add(1, Ordering::Relaxed);
+
+    let scalar = ood_ext3_scalar::<F, E>(coset_offset_pow_n, n_inv, g_n_inv, z_pow_n);
+    Some(apply_ext3_scalar::<E>(&sums_raw, scalar, num_cols))
+}
+
+/// Ext3 counterpart of [`try_barycentric_base_on_handle`] for the aux LDE.
+/// Reads `lde_trace.gpu_aux()` (the de-interleaved 3-slab device buffer).
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn try_barycentric_ext3_on_handle<F, E>(
+    lde_trace: &LDETraceTable<F, E>,
+    row_stride: usize,
+    coset_points: &[FieldElement<F>],
+    coset_offset_pow_n: &FieldElement<F>,
+    n_inv: &FieldElement<F>,
+    g_n_inv: &FieldElement<F>,
+    z_pow_n: &FieldElement<E>,
+    inv_denoms: &[FieldElement<E>],
+) -> Option<Vec<FieldElement<E>>>
+where
+    F: IsField + IsSubFieldOf<E> + 'static,
+    E: IsField + 'static,
+{
+    if TypeId::of::<F>() != TypeId::of::<GoldilocksField>() {
+        return None;
+    }
+    if TypeId::of::<E>() != TypeId::of::<Degree3GoldilocksExtensionField>() {
+        return None;
+    }
+    let aux = lde_trace.gpu_aux()?;
+    let num_cols = aux.m;
+    if num_cols == 0 {
+        return Some(Vec::new());
+    }
+    let n = coset_points.len();
+    if !n.is_power_of_two() || n < gpu_bary_threshold() {
+        return None;
+    }
+    if inv_denoms.len() != n || aux.lde_size != n.checked_mul(row_stride)? {
+        return None;
+    }
+
+    let points_raw: &[u64] = unsafe { from_raw_parts(coset_points.as_ptr() as *const u64, n) };
+    let inv_denoms_len = n.checked_mul(3).expect("inv_denoms u64 len overflow");
+    let inv_denoms_raw: &[u64] =
+        unsafe { from_raw_parts(inv_denoms.as_ptr() as *const u64, inv_denoms_len) };
+
+    let sums_raw = match math_cuda::barycentric::barycentric_ext3_on_device(
+        aux,
+        row_stride,
+        points_raw,
+        inv_denoms_raw,
+        n,
+    ) {
+        Ok(v) => v,
+        Err(_) => return None,
+    };
+    GPU_BARY_CALLS.fetch_add(1, Ordering::Relaxed);
+
+    let scalar = ood_ext3_scalar::<F, E>(coset_offset_pow_n, n_inv, g_n_inv, z_pow_n);
+    Some(apply_ext3_scalar::<E>(&sums_raw, scalar, num_cols))
+}
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index dd26e020a..2c02eacf5 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -1020,28 +1020,66 @@ pub trait IsStarkProver<
                 Polynomial::interpolate_offset_fft(&constraint_evaluations, &domain.coset_offset)
                     .unwrap();
             let composition_poly_parts = composition_poly.break_in_parts(number_of_parts);
-            composition_poly_parts
-                .iter()
-                .map(|part| {
-                    evaluate_polynomial_on_lde_domain(
-                        part,
-                        domain.blowup_factor,
-                        domain.interpolation_domain_size,
-                        &domain.coset_offset,
-                    )
-                    .unwrap()
-                })
-                .collect()
+
+            let cpu_eval = || -> Vec<Vec<FieldElement<FieldExtension>>> {
+                composition_poly_parts
+                    .iter()
+                    .map(|part| {
+                        evaluate_polynomial_on_lde_domain(
+                            part,
+                            domain.blowup_factor,
+                            domain.interpolation_domain_size,
+                            &domain.coset_offset,
+                        )
+                        .unwrap()
+                    })
+                    .collect()
+            };
+
+            // GPU fast path: batched ext3 LDE for all parts in one call.
+            // Non-`_keep` variant. The device buffer is freed at end of
+            // call; downstream R3 reads the host-side evals. PR-4 will
+            // upgrade to the `_keep` variant to retain the handle for R4
+            // DEEP composition.
+            #[cfg(feature = "cuda")]
+            {
+                let parts_slices: Vec<&[FieldElement<FieldExtension>]> = composition_poly_parts
+                    .iter()
+                    .map(|p| p.coefficients.as_slice())
+                    .collect();
+                crate::gpu_lde::try_evaluate_parts_on_lde_gpu::<Field, FieldExtension>(
+                    &parts_slices,
+                    domain.blowup_factor,
+                    domain.interpolation_domain_size,
+                    &domain.coset_offset,
+                )
+                .unwrap_or_else(cpu_eval)
+            }
+            #[cfg(not(feature = "cuda"))]
+            cpu_eval()
         };
         #[cfg(feature = "instruments")]
         let fft_dur = t_sub.elapsed();
 
         #[cfg(feature = "instruments")]
         let t_sub = Instant::now();
-        let Some((composition_poly_merkle_tree, composition_poly_root)) =
-            Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
-        else {
-            return Err(ProvingError::EmptyCommitment);
+        // GPU fast path for the comp-poly Merkle commit: row-pair Keccak
+        // leaves + device-side inner tree, both wrapping the host eval Vecs.
+        #[cfg(feature = "cuda")]
+        let gpu_tree = crate::gpu_lde::try_build_comp_poly_tree_gpu::<
+            FieldExtension,
+            BatchedMerkleTreeBackend<FieldExtension>,
+        >(&lde_composition_poly_parts_evaluations);
+        #[cfg(not(feature = "cuda"))]
+        let gpu_tree: Option<BatchedMerkleTree<FieldExtension>> = None;
+
+        let (composition_poly_merkle_tree, composition_poly_root) = match gpu_tree {
+            Some(tree) => {
+                let root = tree.root;
+                (tree, root)
+            }
+            None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
+                .ok_or(ProvingError::EmptyCommitment)?,
         };
         #[cfg(feature = "instruments")]
         let merkle_dur = t_sub.elapsed();
diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index c8e96987b..f4469447d 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -436,8 +436,8 @@ pub fn get_trace_evaluations_from_lde<F, E>(
     dc: &DomainConstants<F>,
 ) -> Table<E>
 where
-    F: IsSubFieldOf<E> + IsFFTField,
-    E: IsField,
+    F: IsSubFieldOf<E> + IsFFTField + 'static,
+    E: IsField + 'static,
 {
     let n = domain.interpolation_domain_size;
     let bf = domain.blowup_factor;
@@ -468,58 +468,113 @@ where
         let vanishing = z_pow_n.sub_subfield(&dc.offset_pow_n);
         let vanishing_factor = &n_inv_g_n_inv * &vanishing;
 
-        // Precompute inv_denoms = 1/(eval_point - coset_point_i) — shared across all columns
+        // Precompute inv_denoms = 1/(eval_point - coset_point_i), shared across all columns.
+        // Stays on CPU: the batch-invert cost at this scale (n * num_eval_points) is already
+        // rayon-parallelised across tables, and a GPU port regressed wall time in a
+        // 2x15-trial A/B due to stream contention from many concurrent launches.
         let inv_denoms = barycentric_inv_denoms(eval_point, &dc.points);
 
-        // Precompute col_scale[i] = point[i] * inv_denom[i] — shared across ALL columns.
-        // This eliminates N redundant F×E multiplies per column.
-        let col_scale: Vec<FieldElement<E>> = dc
-            .points
-            .iter()
-            .zip(inv_denoms.iter())
-            .map(|(point, inv_d)| point * inv_d)
-            .collect();
-
-        // Evaluate all main columns directly from LDE (no extraction copy).
-        // For main columns (base field F): sum = Σ col_scale[i] * lde_col[i*bf]
-        // lde_col[i*bf] is F, col_scale[i] is E; use F×E → E mixed arithmetic.
-        #[cfg(feature = "parallel")]
-        let main_iter = (0..num_main_cols).into_par_iter();
-        #[cfg(not(feature = "parallel"))]
-        let main_iter = 0..num_main_cols;
-        let main_evals: Vec<FieldElement<E>> = main_iter
-            .map(|col_idx| {
-                let lde_col = &lde_trace.main_columns[col_idx];
-                let sum = col_scale
+        // col_scale[i] = point[i] * inv_denom[i], shared across ALL CPU column
+        // loops below. Computed lazily on first CPU-fallback use so the all-GPU
+        // path pays nothing, while the all-CPU and mixed paths only pay once.
+        let mut col_scale: Option<Vec<FieldElement<E>>> = None;
+
+        // GPU fast path: batched strided barycentric over the main-trace LDE
+        // already on device. Returns `None` when the GPU R1 path didn't run
+        // for this table (handle absent), the size is below threshold, types
+        // don't match, or the math-cuda call errored. Caller falls through
+        // to the existing rayon CPU loop.
+        #[cfg(feature = "cuda")]
+        let main_gpu = crate::gpu_lde::try_barycentric_base_on_handle::<F, E>(
+            lde_trace,
+            bf,
+            &dc.points,
+            &dc.offset_pow_n,
+            &dc.size_inv,
+            &dc.offset_pow_n_inv,
+            &z_pow_n,
+            &inv_denoms,
+        );
+        #[cfg(not(feature = "cuda"))]
+        let main_gpu: Option<Vec<FieldElement<E>>> = None;
+
+        let main_evals: Vec<FieldElement<E>> = if let Some(v) = main_gpu {
+            v
+        } else {
+            let col_scale = col_scale.get_or_insert_with(|| {
+                dc.points
                     .iter()
-                    .enumerate()
-                    .fold(FieldElement::<E>::zero(), |acc, (i, scale)| {
-                        acc + &lde_col[i * bf] * scale
-                    });
-                &vanishing_factor * &sum
-            })
-            .collect();
+                    .zip(inv_denoms.iter())
+                    .map(|(point, inv_d)| point * inv_d)
+                    .collect()
+            });
+            // Evaluate all main columns directly from LDE (no extraction copy).
+            // For main columns (base field F): sum = sum over i of col_scale[i] * lde_col[i*bf].
+            // lde_col[i*bf] is F, col_scale[i] is E; use F*E -> E mixed arithmetic.
+            #[cfg(feature = "parallel")]
+            let main_iter = (0..num_main_cols).into_par_iter();
+            #[cfg(not(feature = "parallel"))]
+            let main_iter = 0..num_main_cols;
+            main_iter
+                .map(|col_idx| {
+                    let lde_col = &lde_trace.main_columns[col_idx];
+                    let sum = col_scale
+                        .iter()
+                        .enumerate()
+                        .fold(FieldElement::<E>::zero(), |acc, (i, scale)| {
+                            acc + &lde_col[i * bf] * scale
+                        });
+                    &vanishing_factor * &sum
+                })
+                .collect()
+        };
         table_data.extend(main_evals);
 
-        // Evaluate all aux columns directly from LDE (no extraction copy).
-        // For aux columns (extension field E): sum = Σ col_scale[i] * lde_col[i*bf]
-        // Both col_scale and lde_col are in E, so each multiply is E×E → E.
-        #[cfg(feature = "parallel")]
-        let aux_iter = (0..num_aux_cols).into_par_iter();
-        #[cfg(not(feature = "parallel"))]
-        let aux_iter = 0..num_aux_cols;
-        let aux_evals: Vec<FieldElement<E>> = aux_iter
-            .map(|col_idx| {
-                let lde_col = &lde_trace.aux_columns[col_idx];
-                let sum = col_scale
+        // GPU fast path for aux columns reading the de-interleaved ext3 LDE handle.
+        #[cfg(feature = "cuda")]
+        let aux_gpu = crate::gpu_lde::try_barycentric_ext3_on_handle::<F, E>(
+            lde_trace,
+            bf,
+            &dc.points,
+            &dc.offset_pow_n,
+            &dc.size_inv,
+            &dc.offset_pow_n_inv,
+            &z_pow_n,
+            &inv_denoms,
+        );
+        #[cfg(not(feature = "cuda"))]
+        let aux_gpu: Option<Vec<FieldElement<E>>> = None;
+
+        let aux_evals: Vec<FieldElement<E>> = if let Some(v) = aux_gpu {
+            v
+        } else {
+            let col_scale = col_scale.get_or_insert_with(|| {
+                dc.points
                     .iter()
-                    .enumerate()
-                    .fold(FieldElement::<E>::zero(), |acc, (i, scale)| {
-                        acc + scale * &lde_col[i * bf]
-                    });
-                &vanishing_factor * &sum
-            })
-            .collect();
+                    .zip(inv_denoms.iter())
+                    .map(|(point, inv_d)| point * inv_d)
+                    .collect()
+            });
+            // Evaluate all aux columns directly from LDE (no extraction copy).
+            // For aux columns (extension field E): sum = sum over i of col_scale[i] * lde_col[i*bf].
+            // Both col_scale and lde_col are in E, so each multiply is E*E -> E.
+            #[cfg(feature = "parallel")]
+            let aux_iter = (0..num_aux_cols).into_par_iter();
+            #[cfg(not(feature = "parallel"))]
+            let aux_iter = 0..num_aux_cols;
+            aux_iter
+                .map(|col_idx| {
+                    let lde_col = &lde_trace.aux_columns[col_idx];
+                    let sum = col_scale
+                        .iter()
+                        .enumerate()
+                        .fold(FieldElement::<E>::zero(), |acc, (i, scale)| {
+                            acc + scale * &lde_col[i * bf]
+                        });
+                    &vanishing_factor * &sum
+                })
+                .collect()
+        };
         table_data.extend(aux_evals);
     }
 
diff --git a/prover/tests/cuda_path_integration.rs b/prover/tests/cuda_path_integration.rs
new file mode 100644
index 000000000..e54becb85
--- /dev/null
+++ b/prover/tests/cuda_path_integration.rs
@@ -0,0 +1,49 @@
+//! End-to-end GPU path coverage. Proves a real ELF with the `cuda` feature on
+//! and asserts every dispatch counter introduced by the cuda backend actually
+//! fired. Catches silent CPU-fallback regressions where the prover would still
+//! emit a valid proof but the GPU path got skipped (size below threshold, type
+//! mismatch, handle plumbing broken, etc.).
+//!
+//! `#[ignore]`'d so the no-GPU CI path skips it. Run via `make test-cuda-integration`
+//! or `cargo test -p lambda-vm-prover --release --features cuda --test cuda_path_integration -- --ignored --nocapture`.
+#![cfg(feature = "cuda")]
+
+use lambda_vm_prover::prove;
+use lambda_vm_prover::test_utils::asm_elf_bytes;
+use stark::gpu_lde::{
+    gpu_bary_calls, gpu_comp_poly_tree_calls, gpu_lde_calls, gpu_parts_lde_calls,
+    reset_all_gpu_call_counters,
+};
+
+#[test]
+#[ignore = "requires GPU; run with --ignored --nocapture"]
+fn gpu_path_fires_end_to_end() {
+    // Warm-up amortises PTX load + pool warm-up + first-call pinned alloc
+    // so the profiled-pass counter assertions reflect only steady-state work.
+    let elf = asm_elf_bytes("fib_iterative_1M");
+    let _ = prove(&elf).expect("warm-up prove");
+
+    reset_all_gpu_call_counters();
+
+    let _ = prove(&elf).expect("prove");
+
+    // R1 main + aux fused LDE+Merkle. Fires for every table above the LDE
+    // threshold; fib_iterative_1M has plenty.
+    assert!(gpu_lde_calls() > 0, "R1 GPU LDE path did not fire");
+
+    // R3 OOD barycentric reads the R1 main + aux device handles. Fires once
+    // per (eval-point, base/ext3) pair for every table that took the R1 GPU
+    // path.
+    assert!(gpu_bary_calls() > 0, "R3 GPU barycentric did not fire");
+
+    // R2 ext3 LDE of composition-poly parts. Only fires when an AIR's
+    // `number_of_parts > 2`. The branch and shift tables have degree-3
+    // transition constraints, so this triggers on any non-trivial prove.
+    assert!(gpu_parts_lde_calls() > 0, "R2 GPU parts LDE did not fire");
+
+    // R2 comp-poly Merkle tree build, paired with the parts LDE above.
+    assert!(
+        gpu_comp_poly_tree_calls() > 0,
+        "R2 GPU comp-poly tree did not fire"
+    );
+}