Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d1a0abf
add first cuda files
ColoCarletti May 6, 2026
79634ff
fmt
ColoCarletti May 6, 2026
ac6fbb5
fix clippy
ColoCarletti May 6, 2026
2ceb3b0
gpu 2nd part
ColoCarletti May 6, 2026
affceb1
feat(cuda): Round 1 GPU LDE+commit dispatch + device-resident handles
ColoCarletti May 6, 2026
01172f2
merge main
ColoCarletti May 19, 2026
c4627e1
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 19, 2026
01aa5e4
comments fix
ColoCarletti May 20, 2026
cfc5c19
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
MauroToscano May 21, 2026
ea5696f
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
a8cf265
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
fb8d31f
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
a79f2b5
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
761a2c0
Update crypto/stark/src/gpu_lde.rs
ColoCarletti May 21, 2026
e066e9d
address reviews
ColoCarletti May 21, 2026
7d3d0f0
fix review comments
ColoCarletti May 22, 2026
cf80771
Merge remote-tracking branch 'origin/main' into feat/cuda-pr2-r1-gpu-…
ColoCarletti May 22, 2026
71aba0d
address doc comment suggestions
ColoCarletti May 22, 2026
83d91b8
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 22, 2026
34cae4b
fix
ColoCarletti May 22, 2026
f076bf4
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
gabrielbosio May 27, 2026
a2cde0f
Pass replay transcript to bus-balance call in verify_vm_minimal
gabrielbosio May 27, 2026
46c305b
Update crypto/math-cuda/src/device.rs
ColoCarletti May 28, 2026
aca3dca
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 28, 2026
63d7c00
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
eb16c02
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
66925b1
Update crypto/math-cuda/src/device.rs
ColoCarletti May 29, 2026
4e6daf3
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
4cd27d9
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
5fe390f
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
5819930
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
33f7c36
Update crypto/math-cuda/src/lde.rs
ColoCarletti May 29, 2026
49d3607
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti May 29, 2026
c52521e
Merge branch 'main' into feat/cuda-pr2-r1-gpu-commits
ColoCarletti Jun 1, 2026
828ee16
fix comments
ColoCarletti Jun 1, 2026
fb2c5fe
Update Makefile
ColoCarletti Jun 1, 2026
5d77773
fix makefile
ColoCarletti Jun 1, 2026
0c990f1
Update prover/src/tests/prove_elfs_tests.rs
ColoCarletti Jun 1, 2026
da3c72d
fmt
ColoCarletti Jun 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.PHONY: deps deps-linux deps-macos prepare-test-data compile-programs-asm compile-programs-rust compile-bench \
compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm test-no-compile \
test-asm-no-compile test-rust test-rust-no-compile test-executor flamegraph-prover \
test-fast test-prover test-prover-all test-disk-spill test-math-cuda bench-math-cuda build check clippy fmt lint
test-fast test-prover test-prover-all test-disk-spill test-math-cuda bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint

UNAME := $(shell uname)

Expand Down Expand Up @@ -198,6 +198,15 @@ test-math-cuda:
bench-math-cuda:
cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture

# Single-prove wall-time bench (warm-up + profiled run of fib_iterative_1M).
bench-prover:
cargo test -p lambda-vm-prover --release --test bench_single -- --ignored --nocapture

Comment thread
ColoCarletti marked this conversation as resolved.
# Single-prove wall-time bench with the GPU LDE path enabled.
# Needs an NVIDIA GPU + CUDA toolkit/driver.
bench-prover-cuda:
cargo test -p lambda-vm-prover --release --features cuda --test bench_single -- --ignored --nocapture

# Build all
build:
cargo build --workspace
Expand Down
38 changes: 38 additions & 0 deletions crypto/crypto/src/merkle_tree/merkle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,44 @@ where
Self::build_from_hashed_leaves(hashed_leaves)
}

/// Wraps an already-computed node array in a [`MerkleTree`] without
/// rebuilding it.
///
/// Useful for handing a GPU-built tree to the stark prover. Release builds
/// perform no hashing, so the caller is responsible for the layout's
/// cryptographic correctness; debug builds re-hash only the root from its
/// two children as a spot-check.
///
/// Expected layout (matches [`Self::build_from_hashed_leaves`]):
/// - `nodes.len() == 2 * leaves_len - 1` where `leaves_len` is a power of two
/// - `nodes[0]` is the root
/// - `nodes[leaves_len - 1 .. 2*leaves_len - 1]` are the leaves
///
/// Returns `None` when `nodes` is empty or when `nodes.len() + 1` is not a
/// power of two. Only the length is validated; node contents are trusted.
///
/// # Panics
///
/// In debug builds only, panics if the tree has at least three nodes and
/// `nodes[ROOT]` differs from `B::hash_new_parent(&nodes[1], &nodes[2])`.
pub fn from_precomputed_nodes(nodes: Vec<B::Node>) -> Option<Self> {
Comment thread
ColoCarletti marked this conversation as resolved.
if nodes.is_empty() {
return None;
}
// Validate (cheap) that (nodes.len() + 1) is a power of two: there
// must be `leaves_len - 1 + leaves_len = 2*leaves_len - 1` entries.
// The emptiness check above is still required: a length of 0 would pass
// this test because `0 + 1` is itself a power of two.
let total = nodes.len();
if !(total + 1).is_power_of_two() {
return None;
}
// Debug-only integrity spot-check: the root must equal hash(left, right).
// Catches GPU correctness regressions in CI without paying for a full
// tree walk on every call. Skipped for a single-node tree (`total == 1`),
// which has no children to hash.
#[cfg(debug_assertions)]
if total >= 3 {
let expected_root = B::hash_new_parent(&nodes[1], &nodes[2]);
debug_assert!(
nodes[ROOT] == expected_root,
"from_precomputed_nodes: root does not hash from children",
);
}
// Clone the root out before `nodes` is moved into the struct below.
let root = nodes[ROOT].clone();
Some(MerkleTree {
root,
nodes,
#[cfg(feature = "disk-spill")]
mmap_backing: None,
})
}

/// Create a Merkle tree from pre-hashed leaf nodes.
///
/// This skips the `hash_leaves` step, useful when leaves have already been
Expand Down
59 changes: 42 additions & 17 deletions crypto/math-cuda/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,15 @@ const STREAM_POOL_SIZE: usize = 32;
pub struct Backend {
pub ctx: Arc<CudaContext>,
streams: Vec<Arc<CudaStream>>,
/// Single shared pinned staging buffer, grown to the biggest LDE size
/// seen. Concurrent batched LDE calls serialise on it; in exchange the
/// process keeps only ONE gigabyte-sized pinned allocation (per-stream
/// buffers 32×-inflated memory use and multiplied the one-time pinning
/// cost for every first use of a new table size).
pinned_staging: Mutex<PinnedStaging>,
/// Separate pinned staging for Merkle leaf hashes. Sized `num_rows * 32`
/// bytes. It lives alongside the LDE staging so the GPU→host D2H for
/// hashed leaves runs at full PCIe line-rate.
pinned_hashes: Mutex<PinnedStaging>,
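/// Per-rayon-worker pinned staging buffers, indexed by
/// `rayon::current_thread_index()` (slot 0 for non-rayon callers).
/// Giving each worker its own slot means concurrent batched LDE calls on
/// different workers do not serialise on one shared buffer; the price is
/// more pinned memory than a single shared allocation would use.
/// Each slot grows lazily on first use; idle slots stay at zero allocation.
/// Worst-case footprint is `N_workers × max_LDE_size` of pinned host RAM.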
pinned_staging: Vec<Mutex<PinnedStaging>>,
/// Per-worker pinned staging for Merkle leaf hashes. Same layout as
/// `pinned_staging`; sized `num_rows * 32` bytes per slot. Lives
/// alongside the LDE staging so the GPU→host D2H runs at PCIe line-rate.
pinned_hashes: Vec<Mutex<PinnedStaging>>,
util_stream: Arc<CudaStream>,
next: AtomicUsize,

Expand Down Expand Up @@ -166,8 +165,20 @@ impl Backend {
for _ in 0..STREAM_POOL_SIZE {
streams.push(ctx.new_stream()?);
}
let pinned_staging = Mutex::new(PinnedStaging::empty());
let pinned_hashes = Mutex::new(PinnedStaging::empty());
// One slot per rayon worker. `current_thread_index()` returns
// `0..current_num_threads()`, and non-rayon callers (None) map to slot 0,
// so this many slots covers every caller.
//
// `current_num_threads()` returns the default-pool size (the cpu count)
// when no custom pool is in use. Stable across the backend's lifetime
// since rayon's pool is fixed at first use.
let n_slots = rayon::current_num_threads().max(1);
let pinned_staging: Vec<Mutex<PinnedStaging>> = (0..n_slots)
.map(|_| Mutex::new(PinnedStaging::empty()))
.collect();
let pinned_hashes: Vec<Mutex<PinnedStaging>> = (0..n_slots)
.map(|_| Mutex::new(PinnedStaging::empty()))
.collect();
// Separate "utility" stream for twiddle uploads and other bookkeeping;
// not part of the pool that callers rotate through.
let util_stream = ctx.new_stream()?;
Expand Down Expand Up @@ -219,16 +230,30 @@ impl Backend {
self.streams[idx].clone()
}

/// Shared pinned staging buffer. Grows to the largest LDE the process
/// has seen so far. Concurrent callers serialise on the mutex.
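/// Per-rayon-worker pinned staging buffer. Returns the slot for the
/// current worker (or slot 0 outside a rayon context). Grows lazily to
/// the largest LDE the worker has seen. See [`Backend`]'s
/// `pinned_staging` field for the rationale behind the per-worker
/// split.
///
/// Do not call into rayon (`par_iter`, `join`, ...) while holding the
/// returned lock; the host-side copies in `lde.rs` are sequential for
/// this reason. NOTE(review): the hazard is presumably rayon work
/// stealing: a worker that blocks inside a parallel call can pick up
/// another job that maps to this same slot and then wait on the mutex it
/// already holds. Confirm against the original deadlock report.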
pub fn pinned_staging(&self) -> &Mutex<PinnedStaging> {
&self.pinned_staging
&self.pinned_staging[self.worker_slot(self.pinned_staging.len())]
}

/// Separate pinned staging for Merkle leaf hash output. Sized in u64
/// Per-worker pinned staging for Merkle leaf hash output. Sized in u64
/// units. Caller should reserve `(num_rows * 32 + 7) / 8` u64s.
pub fn pinned_hashes(&self) -> &Mutex<PinnedStaging> {
&self.pinned_hashes
&self.pinned_hashes[self.worker_slot(self.pinned_hashes.len())]
}

/// Map `rayon::current_thread_index()` to a slot index, with a defensive
/// clamp in case the rayon pool grew past the Vec we sized at init.
///
/// `len` is the number of slots in the Vec the caller is about to index.
/// Threads for which `current_thread_index()` returns `None` use slot 0.
fn worker_slot(&self, len: usize) -> usize {
let idx = rayon::current_thread_index().unwrap_or(0);
// Should be unreachable with rayon's fixed default pool, but if a
// larger custom pool sneaks in we still want safety: debug builds trip
// the assert below, release builds clamp to the LAST slot, `len - 1`
// (correctness preserved, just contention on that slot).
debug_assert!(idx < len, "rayon worker {idx} >= staging slots {len}");
// `saturating_sub` keeps `len == 0` from underflowing here.
idx.min(len.saturating_sub(1))
}

pub fn fwd_twiddles_for(&self, log_n: u64) -> Result<Arc<CudaSlice<u64>>> {
Expand Down
93 changes: 33 additions & 60 deletions crypto/math-cuda/src/lde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
use std::sync::Arc;

use cudarc::driver::{CudaSlice, CudaStream, LaunchConfig, PushKernelArg};
use rayon::prelude::*;

use crate::Result;
use crate::device::{Backend, backend};
Expand Down Expand Up @@ -69,7 +68,9 @@ pub(crate) fn pack_ext3_to_pinned_slabs(columns: &[&[u64]], pinned: &mut [u64],
let m = columns.len();
debug_assert!(pinned.len() >= 3 * m * n);
let pinned_ptr_u = pinned.as_mut_ptr() as usize;
columns.par_iter().enumerate().for_each(|(c, col)| {
// Runs under the pinned-staging lock, where rayon can deadlock. See
// `Backend::pinned_staging`.
columns.iter().enumerate().for_each(|(c, col)| {
// SAFETY: each task writes to disjoint `[(c*3 + k)*n .. ..+n]` regions
// of `pinned`. The outer `&mut [u64]` borrow guarantees no aliasing.
let slab_a = unsafe {
Expand All @@ -96,7 +97,9 @@ fn unpack_pinned_slabs_to_ext3(pinned: &[u64], outputs: &mut [&mut [u64]], lde_s
let m = outputs.len();
debug_assert!(pinned.len() >= 3 * m * lde_size);
let pinned_const = pinned.as_ptr() as usize;
outputs.par_iter_mut().enumerate().for_each(|(c, dst)| {
// Runs under the pinned-staging lock, where rayon can deadlock. See
// `Backend::pinned_staging`.
outputs.iter_mut().enumerate().for_each(|(c, dst)| {
// SAFETY: each task reads from disjoint `[(c*3 + k)*lde_size .. ..+lde_size]`
// regions of `pinned`. Caller borrows `pinned` for the duration of the call.
let slab_a = unsafe {
Expand Down Expand Up @@ -178,19 +181,9 @@ fn d2h_bytes_via_pinned_hashes(
stream.memcpy_dtoh(dev_bytes, pinned_bytes)?;
stream.synchronize()?;

// Single-threaded `copy_from_slice` faults virgin pageable pages one at
// a time; the mm_struct rwsem serialises them at prover scale. Chunk so
// ~N cores pre-fault+write in parallel.
const CHUNK: usize = 64 * 1024;
let src_ptr = pinned_bytes.as_ptr() as usize;
dst.par_chunks_mut(CHUNK).enumerate().for_each(|(i, d)| {
// SAFETY: each task reads `[i*CHUNK .. i*CHUNK + d.len()]` of
// `pinned_bytes`, which is disjoint per `i` and lives until `staging`
// is dropped below.
let src =
unsafe { std::slice::from_raw_parts((src_ptr as *const u8).add(i * CHUNK), d.len()) };
d.copy_from_slice(src);
});
// Runs under the pinned_hashes lock, where rayon can deadlock. See
// `Backend::pinned_staging`.
dst.copy_from_slice(pinned_bytes);
drop(staging);
Ok(())
}
Expand Down Expand Up @@ -367,18 +360,12 @@ pub fn coset_lde_batch_base(
// SAFETY: staging is locked, the slice alias ends before we unlock.
let pinned = unsafe { staging.as_mut_slice(m * lde_size) };

// Pack columns into first m*n slots of the pinned buffer. Parallel: pinned
// writes are DRAM-bandwidth bound, so rayon spreads the cost across CPU
// cores.
let pinned_base_ptr = pinned.as_mut_ptr() as usize;
columns.par_iter().enumerate().for_each(|(c, col)| {
// SAFETY: each task writes to a disjoint `[c*n..c*n+n]` region of
// `pinned`, and the outer `staging` lock guarantees no other call is
// using the buffer concurrently.
let dst =
unsafe { std::slice::from_raw_parts_mut((pinned_base_ptr as *mut u64).add(c * n), n) };
dst.copy_from_slice(col);
});
// Pack columns into the first m*n slots of the pinned buffer. Runs under
// the pinned-staging lock, where rayon can deadlock. See
// `Backend::pinned_staging`.
for (c, col) in columns.iter().enumerate() {
pinned[c * n..c * n + n].copy_from_slice(col);
}

// Column layout: `buf[c * lde_size + r]`. Zeroed so the [n, lde_size)
// tail of each column is already the zero-pad the CPU path does.
Expand Down Expand Up @@ -459,12 +446,9 @@ pub fn coset_lde_batch_base(
stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?;
stream.synchronize()?;

// Split pinned → per-column Vec<u64>s. The first write to each virgin
// Vec page-faults, which can dominate total time. Parallelise so the
// fault cost spreads across CPU cores.
let pinned_ptr = pinned.as_ptr() as usize;
// Split pinned into per-column Vec<u64>s. Runs under the pinned-staging
// lock, where rayon can deadlock. See `Backend::pinned_staging`.
let out: Vec<Vec<u64>> = (0..m)
.into_par_iter()
.map(|c| {
// set_len skips the O(N) zero-init that vec![0; n] would do.
// copy_from_slice below writes every slot before any reader
Expand All @@ -475,10 +459,7 @@ pub fn coset_lde_batch_base(
unsafe { v.set_len(lde_size) };
v
};
let src = unsafe {
std::slice::from_raw_parts((pinned_ptr as *const u64).add(c * lde_size), lde_size)
};
v.copy_from_slice(src);
v.copy_from_slice(&pinned[c * lde_size..c * lde_size + lde_size]);
v
})
.collect();
Expand Down Expand Up @@ -602,15 +583,11 @@ pub fn coset_lde_batch_base_into(
stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?;
stream.synchronize()?;

// Parallel copy pinned → caller outputs. Caller's Vecs may still fault
// on first write; we spread that cost across rayon cores.
let pinned_ptr = pinned.as_ptr() as usize;
outputs.par_iter_mut().enumerate().for_each(|(c, dst)| {
let src = unsafe {
std::slice::from_raw_parts((pinned_ptr as *const u64).add(c * lde_size), lde_size)
};
dst.copy_from_slice(src);
});
// Copy pinned into caller outputs. Runs under the pinned-staging lock,
// where rayon can deadlock. See `Backend::pinned_staging`.
for (c, dst) in outputs.iter_mut().enumerate() {
dst.copy_from_slice(&pinned[c * lde_size..c * lde_size + lde_size]);
}
drop(staging);
Ok(())
}
Expand Down Expand Up @@ -734,12 +711,11 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
staging.ensure_capacity(m * lde_size, &be.ctx)?;
let pinned = unsafe { staging.as_mut_slice(m * lde_size) };

let pinned_base_ptr = pinned.as_mut_ptr() as usize;
columns.par_iter().enumerate().for_each(|(c, col)| {
let dst =
unsafe { std::slice::from_raw_parts_mut((pinned_base_ptr as *mut u64).add(c * n), n) };
dst.copy_from_slice(col);
});
// Pack columns into the pinned buffer. Runs under the pinned-staging
// lock, where rayon can deadlock. See `Backend::pinned_staging`.
for (c, col) in columns.iter().enumerate() {
pinned[c * n..c * n + n].copy_from_slice(col);
}

let mut buf = stream.alloc_zeros::<u64>(m * lde_size)?;
for c in 0..m {
Expand Down Expand Up @@ -833,14 +809,11 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?;
d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, nodes_out)?;

// Pinned LDE → caller outputs (post-sync host memcpy).
let pinned_ptr = pinned.as_ptr() as usize;
outputs.par_iter_mut().enumerate().for_each(|(c, dst)| {
let src = unsafe {
std::slice::from_raw_parts((pinned_ptr as *const u64).add(c * lde_size), lde_size)
};
dst.copy_from_slice(src);
});
// Copy pinned into caller outputs. Runs under the pinned-staging lock,
// where rayon can deadlock. See `Backend::pinned_staging`.
for (c, dst) in outputs.iter_mut().enumerate() {
dst.copy_from_slice(&pinned[c * lde_size..c * lde_size + lde_size]);
}
drop(staging);

if keep_device_buf {
Expand Down
1 change: 1 addition & 0 deletions crypto/stark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rayon = { version = "1.8.0", optional = true }
memmap2 = { version = "0.9", optional = true }
tempfile = { version = "3", optional = true }
libc = { version = "0.2", optional = true }

Comment thread
ColoCarletti marked this conversation as resolved.
# GPU backend for trace LDE — only linked when `cuda` is enabled.
math-cuda = { path = "../math-cuda", optional = true }

Expand Down
Loading
Loading