Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
d1a0abf
add first cuda files
ColoCarletti May 6, 2026
79634ff
fmt
ColoCarletti May 6, 2026
ac6fbb5
fix clippy
ColoCarletti May 6, 2026
2ceb3b0
gpu 2nd part
ColoCarletti May 6, 2026
affceb1
feat(cuda): Round 1 GPU LDE+commit dispatch + device-resident handles
ColoCarletti May 6, 2026
01172f2
merge main
ColoCarletti May 19, 2026
c4627e1
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 19, 2026
01aa5e4
comments fix
ColoCarletti May 20, 2026
cfc5c19
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
MauroToscano May 21, 2026
ea5696f
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
a8cf265
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
fb8d31f
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
a79f2b5
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
761a2c0
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
e066e9d
address reviews
ColoCarletti May 21, 2026
7d3d0f0
fix review comments
ColoCarletti May 22, 2026
cf80771
Merge remote-tracking branch 'origin/main' into feat/cuda-pr2-r1-gpu-…
ColoCarletti May 22, 2026
71aba0d
address doc comment suggestions
ColoCarletti May 22, 2026
83d91b8
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 22, 2026
34cae4b
fix
ColoCarletti May 22, 2026
f076bf4
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
gabrielbosio May 27, 2026
a2cde0f
Pass replay transcript to bus-balance call in verify_vm_minimal
gabrielbosio May 27, 2026
46c305b
Update crypto/math-cuda/src/device.rs
ColoCarletti May 28, 2026
aca3dca
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 28, 2026
63d7c00
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
eb16c02
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
66925b1
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
4e6daf3
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
4cd27d9
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
5fe390f
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
5819930
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
33f7c36
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
49d3607
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 29, 2026
99cd59c
add pr3 code
ColoCarletti Jun 1, 2026
c52521e
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti Jun 1, 2026
828ee16
fix comments
ColoCarletti Jun 1, 2026
19a36a0
Merge remote-tracking branch 'origin/feat/cuda-pr2-r1-gpu-commits' in…
ColoCarletti Jun 1, 2026
80e1ecb
fix sync stream after D2H in merke.rs
ColoCarletti Jun 1, 2026
3ead022
Merge branch 'main' into feat/cuda-pr3
ColoCarletti Jun 1, 2026
04dd872
fix comments
ColoCarletti Jun 1, 2026
8a67e33
address review feedback
ColoCarletti Jun 1, 2026
1f9394d
Update crypto/math-cuda/src/barycentric.rs
ColoCarletti Jun 1, 2026
b07999c
Update crypto/math-cuda/src/barycentric.rs
ColoCarletti Jun 1, 2026
c575017
fix imports
ColoCarletti Jun 1, 2026
0ffc661
Merge branch 'feat/cuda-pr3' of github.com:yetanotherco/lambda_vm int…
ColoCarletti Jun 1, 2026
0777f1e
Merge branch 'main' into feat/cuda-pr3
ColoCarletti Jun 3, 2026
2c7b0de
cuda integration tests
ColoCarletti Jun 3, 2026
2f1fe2d
address review feedback
ColoCarletti Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.PHONY: deps deps-linux deps-macos prepare-test-data compile-programs-asm compile-programs-rust compile-bench \
compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm test-no-compile \
test-asm-no-compile test-rust test-rust-no-compile test-executor flamegraph-prover \
test-fast test-prover test-prover-all test-disk-spill test-math-cuda bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint
test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint

UNAME := $(shell uname)

Expand Down Expand Up @@ -194,6 +194,12 @@ test-disk-spill:
test-math-cuda:
cargo test -p math-cuda --release

# End-to-end cuda dispatch coverage (requires NVIDIA GPU + nvcc).
# Asserts every R1/R2/R3 GPU counter fired on a real prove.
test-cuda-integration:
cargo test -p lambda-vm-prover --release --features cuda \
--test cuda_path_integration -- --ignored --nocapture

# math-cuda quick microbench (median of 10 runs)
bench-math-cuda:
cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
Expand Down
1 change: 1 addition & 0 deletions crypto/math-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,5 @@ fn main() {
compile_ptx("arith.cu", "arith.ptx", have_nvcc);
compile_ptx("ntt.cu", "ntt.ptx", have_nvcc);
compile_ptx("keccak.cu", "keccak.ptx", have_nvcc);
compile_ptx("barycentric.cu", "barycentric.ptx", have_nvcc);
}
192 changes: 192 additions & 0 deletions crypto/math-cuda/kernels/barycentric.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
// Barycentric evaluation of a polynomial (given as evaluations on a coset) at
// a single out-of-domain point. Matches the CPU
// `math::polynomial::interpolate_coset_eval_*_with_g_n_inv` pair.
//
// Per column, the barycentric sum is
// S = sum over i of point_i * eval_i * inv_denom_i
// where `point_i` is a base-field coset point, `eval_i` is the polynomial's
// value at that point (base for main-trace columns, ext3 for aux or composition
// columns), and `inv_denom_i = 1 / (z - point_i)` is an ext3 scalar (same for
// every column sharing the evaluation point `z`).
//
// These kernels compute only S. The full OOD value is S scaled by the ext3
// constant `vanishing * n_inv * g_n_inv`, which is constant across a column, so
// the caller applies it once per column (one ext3 mul per column, independent
// of n). Keeping it on the host means the kernel takes no extra ext3 constant
// argument.
//
// Launch: grid = (num_cols, 1, 1), block = (BARY_BLOCK_DIM, 1, 1).

#include "goldilocks.cuh"
#include "ext3.cuh"

// 256 threads/block. One ext3 accumulator per thread in shmem => 6 KiB.
#define BARY_BLOCK_DIM 256

__device__ __forceinline__ ext3::Fe3 block_reduce_ext3(ext3::Fe3 my) {
__shared__ uint64_t shm_a[BARY_BLOCK_DIM];
__shared__ uint64_t shm_b[BARY_BLOCK_DIM];
__shared__ uint64_t shm_c[BARY_BLOCK_DIM];
uint32_t tid = threadIdx.x;
shm_a[tid] = my.a;
shm_b[tid] = my.b;
shm_c[tid] = my.c;
__syncthreads();
for (uint32_t s = BARY_BLOCK_DIM / 2; s > 0; s >>= 1) {
if (tid < s) {
shm_a[tid] = goldilocks::add(shm_a[tid], shm_a[tid + s]);
shm_b[tid] = goldilocks::add(shm_b[tid], shm_b[tid + s]);
shm_c[tid] = goldilocks::add(shm_c[tid], shm_c[tid + s]);
}
__syncthreads();
}
return ext3::make(shm_a[0], shm_b[0], shm_c[0]);
}

/// Base-column variant: M base-field columns, each `col_stride` u64 apart.
/// `inv_denoms` is a flat 3N u64 buffer (ext3, interleaved `[a0,b0,c0,...]`).
/// Writes `out_ext3_int`: 3M u64, ext3 interleaved, one accumulator per column.
extern "C" __global__ void barycentric_base_batched(
const uint64_t *columns,
uint64_t col_stride,
const uint64_t *coset_points,
const uint64_t *inv_denoms,
uint64_t n,
uint64_t *out_ext3_int
) {
Comment thread
ColoCarletti marked this conversation as resolved.
uint64_t col = blockIdx.x;
const uint64_t *col_data = columns + col * col_stride;

ext3::Fe3 acc = ext3::zero();
for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
uint64_t eval = col_data[i];
uint64_t point = coset_points[i];
uint64_t pe = goldilocks::mul(point, eval); // F * F -> F
ext3::Fe3 inv_d = ext3::make(
inv_denoms[i * 3 + 0],
inv_denoms[i * 3 + 1],
inv_denoms[i * 3 + 2]);
ext3::Fe3 term = ext3::mul_base(inv_d, pe); // E * F -> E
acc = ext3::add(acc, term);
}

ext3::Fe3 sum = block_reduce_ext3(acc);
if (threadIdx.x == 0) {
out_ext3_int[col * 3 + 0] = sum.a;
out_ext3_int[col * 3 + 1] = sum.b;
out_ext3_int[col * 3 + 2] = sum.c;
}
}

/// Same as `barycentric_base_batched` but reads rows at stride `row_stride`
/// within each column. Treats the column as an LDE of length `n * row_stride`
/// and sums over the trace-size coset (every `row_stride`-th row). Lets R3 OOD
/// run directly against the LDE device handle from R1 without copying the
/// strided rows into a separate trace-size buffer.
extern "C" __global__ void barycentric_base_batched_strided(
const uint64_t *columns,
uint64_t col_stride,
uint64_t row_stride,
const uint64_t *coset_points,
const uint64_t *inv_denoms,
uint64_t n,
uint64_t *out_ext3_int
) {
uint64_t col = blockIdx.x;
const uint64_t *col_data = columns + col * col_stride;

ext3::Fe3 acc = ext3::zero();
for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
uint64_t eval = col_data[i * row_stride];
uint64_t point = coset_points[i];
uint64_t pe = goldilocks::mul(point, eval);
ext3::Fe3 inv_d = ext3::make(
inv_denoms[i * 3 + 0],
inv_denoms[i * 3 + 1],
inv_denoms[i * 3 + 2]);
ext3::Fe3 term = ext3::mul_base(inv_d, pe);
acc = ext3::add(acc, term);
}

ext3::Fe3 sum = block_reduce_ext3(acc);
if (threadIdx.x == 0) {
out_ext3_int[col * 3 + 0] = sum.a;
out_ext3_int[col * 3 + 1] = sum.b;
out_ext3_int[col * 3 + 2] = sum.c;
}
}

/// Ext3-column variant: M ext3 columns stored as 3M base slabs. Column `c`
/// lives at `columns[(c*3+k)*col_stride + i]` for component `k` in 0..3.
extern "C" __global__ void barycentric_ext3_batched(
const uint64_t *columns,
uint64_t col_stride,
const uint64_t *coset_points,
const uint64_t *inv_denoms,
uint64_t n,
uint64_t *out_ext3_int
) {
uint64_t col = blockIdx.x;
const uint64_t *slab_a = columns + (col * 3 + 0) * col_stride;
const uint64_t *slab_b = columns + (col * 3 + 1) * col_stride;
const uint64_t *slab_c = columns + (col * 3 + 2) * col_stride;

ext3::Fe3 acc = ext3::zero();
for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
ext3::Fe3 eval = ext3::make(slab_a[i], slab_b[i], slab_c[i]);
uint64_t point = coset_points[i];
// F * E -> E. Point times eval, componentwise on the 3 base components.
ext3::Fe3 pe = ext3::mul_base(eval, point);
// E * E -> E
ext3::Fe3 inv_d = ext3::make(
inv_denoms[i * 3 + 0],
inv_denoms[i * 3 + 1],
inv_denoms[i * 3 + 2]);
ext3::Fe3 term = ext3::mul(pe, inv_d);
acc = ext3::add(acc, term);
}

ext3::Fe3 sum = block_reduce_ext3(acc);
if (threadIdx.x == 0) {
out_ext3_int[col * 3 + 0] = sum.a;
out_ext3_int[col * 3 + 1] = sum.b;
out_ext3_int[col * 3 + 2] = sum.c;
}
}

/// Strided ext3 variant for R3 OOD of aux LDE.
extern "C" __global__ void barycentric_ext3_batched_strided(
const uint64_t *columns,
uint64_t col_stride,
uint64_t row_stride,
const uint64_t *coset_points,
const uint64_t *inv_denoms,
uint64_t n,
uint64_t *out_ext3_int
) {
uint64_t col = blockIdx.x;
const uint64_t *slab_a = columns + (col * 3 + 0) * col_stride;
const uint64_t *slab_b = columns + (col * 3 + 1) * col_stride;
const uint64_t *slab_c = columns + (col * 3 + 2) * col_stride;

ext3::Fe3 acc = ext3::zero();
for (uint64_t i = threadIdx.x; i < n; i += BARY_BLOCK_DIM) {
uint64_t lde_i = i * row_stride;
ext3::Fe3 eval = ext3::make(slab_a[lde_i], slab_b[lde_i], slab_c[lde_i]);
uint64_t point = coset_points[i];
ext3::Fe3 pe = ext3::mul_base(eval, point);
ext3::Fe3 inv_d = ext3::make(
inv_denoms[i * 3 + 0],
inv_denoms[i * 3 + 1],
inv_denoms[i * 3 + 2]);
ext3::Fe3 term = ext3::mul(pe, inv_d);
acc = ext3::add(acc, term);
}

ext3::Fe3 sum = block_reduce_ext3(acc);
if (threadIdx.x == 0) {
out_ext3_int[col * 3 + 0] = sum.a;
out_ext3_int[col * 3 + 1] = sum.b;
out_ext3_int[col * 3 + 2] = sum.c;
}
}
Loading
Loading