From ce78bbc891e2d854a3e07e0e1a9d06c1c149e632 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Tue, 26 May 2026 14:01:10 -0300 Subject: [PATCH] perf(stark/verifier): four micro-opts targeting the recursion guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All four changes preserve verifier semantics — `cargo test -p stark` passes 126/126; `cargo test -p lambda-vm-prover --lib` keeps the baseline 273 pass / 77 pre-existing failures (unrelated to constraints, `UnknownSyscall(5)` on `main`); `make lint` is clean. 1) step_2_verify_claimed_composition_polynomial: drop the HashMap step memo in favour of a small Vec scanned linearly. Boundary constraints number in the low tens; in RISC-V the HashMap's hash function + heap allocator outweigh O(B^2) scan savings. 2) reconstruct_deep_composition_poly_evaluations_for_all_queries: hoist lde_base / lde_base_sym / denoms_scratch out of the per-query loop and `.clear()` each iteration instead of fresh `Vec::new()`. At ~64-128 queries this removes hundreds of small allocations per verify. `reconstruct_deep_composition_poly_evaluation` now takes the scratch buffer as a `&mut Vec>` parameter. 3) verify: drop `proof.clone()` for the single-table wrapper. Added a `multi_verify_proofs` slice-taking variant; `multi_verify` is now a one-line wrapper preserving the existing `&MultiProof` API for all current callers (~50 across stark tests and prover). 4) verify_trace_openings and verify_query_and_sym_openings: replace `result & openings_ok` (always-evaluate fold) with early `return false` on each opening failure. Cuts wasted Merkle work on tampered proofs. The FRI loop additionally drops the pre-allocated `evaluation_point_vec` in favour of stepping `evaluation_point_squared` in-place — one fewer Vec alloc per query. A length-alignment guard makes the loop hit its `i == last_layer_idx` return for any well-formed proof. --- crypto/stark/src/verifier.rs | 226 +++++++++++++++++++++-------------- 1 file changed, 137 insertions(+), 89 deletions(-) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 11d5c5c44..e55260360 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -25,7 +25,6 @@ use math::{ }, traits::AsBytes, }; -use std::collections::HashMap; use std::marker::PhantomData; #[cfg(feature = "instruments")] use std::time::Instant; @@ -110,18 +109,23 @@ pub trait IsStarkVerifier< proof.bus_public_inputs.as_ref(), trace_length, ); - // Precompute g^step once per distinct step to avoid the prior O(B^2) - // linear scan. A single pass populates a memo and resolves each - // constraint's step to its point in O(1) amortized. - let mut step_to_point: HashMap> = HashMap::new(); + // Precompute g^step once per distinct step. A small `Vec` with a + // linear scan beats `HashMap` here: boundary constraints typically + // number in the low tens, the recursion guest pays no allocator/hash + // overhead, and the AIR generally emits its constraints grouped by + // step so the scan hits the hot entry first. + let mut step_to_point: Vec<(usize, FieldElement)> = Vec::new(); let boundary_points: Vec> = boundary_constraints .constraints .iter() .map(|c| { - step_to_point - .entry(c.step) - .or_insert_with(|| domain.trace_primitive_root.pow(c.step as u64)) - .clone() + if let Some((_, point)) = step_to_point.iter().find(|(s, _)| *s == c.step) { + point.clone() + } else { + let point = domain.trace_primitive_root.pow(c.step as u64); + step_to_point.push((c.step, point.clone())); + point + } }) .collect(); @@ -352,37 +356,48 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, { // Main trace (multiplicities for preprocessed, full trace for normal). - let mut ok = Self::verify_opening_pair::( + // Short-circuit on any failure: each opening pair is a Merkle-path + // verification (~20 Keccak hashes against base-field leaves); in the + // recursion guest this is non-trivial cycle cost worth skipping. + if !Self::verify_opening_pair::( &deep_poly_openings.main_trace_polys, &proof.lde_trace_main_merkle_root, iota, - ); + ) { + return false; + } // Precomputed trace (preprocessed tables only). Mismatched presence is // unreachable in practice (multi_verify rejects such proofs upstream), // but a defensive check keeps this function self-contained. - ok &= match ( + match ( &proof.lde_trace_precomputed_merkle_root, &deep_poly_openings.precomputed_trace_polys, ) { - (Some(root), Some(opening)) => Self::verify_opening_pair::(opening, root, iota), - (None, None) => true, - _ => false, - }; + (Some(root), Some(opening)) => { + if !Self::verify_opening_pair::(opening, root, iota) { + return false; + } + } + (None, None) => {} + _ => return false, + } // Auxiliary trace. - ok &= match ( + match ( proof.lde_trace_aux_merkle_root, &deep_poly_openings.aux_trace_polys, ) { (Some(root), Some(opening)) => { - Self::verify_opening_pair::(opening, &root, iota) + if !Self::verify_opening_pair::(opening, &root, iota) { + return false; + } } - (None, None) => true, - _ => false, - }; + (None, None) => {} + _ => return false, + } - ok + true } /// Verify opening Open(Hįµ¢(D_LDE), šœ) and Open(Hįµ¢(D_LDE), -šœ) for all parts Hįµ¢of the composition @@ -480,71 +495,70 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, { let fri_layers_merkle_roots = &proof.fri_layers_merkle_roots; - let evaluation_point_vec: Vec> = - core::iter::successors(Some(evaluation_point_inv.square()), |evaluation_point| { - Some(evaluation_point.square()) - }) - .take(fri_layers_merkle_roots.len()) - .collect(); - let p0_eval = deep_composition_evaluation; let p0_eval_sym = deep_composition_evaluation_sym; // Reconstruct p₁(šœĀ²) let mut v = - (p0_eval + p0_eval_sym) + evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym); + (p0_eval + p0_eval_sym) + &evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym); let mut index = iota; // Handle case with 0 FRI layers (trace_length <= 2) - // In this case, the fold loop below doesn't iterate, so we need to verify - // the final value directly here. if fri_layers_merkle_roots.is_empty() { return v == proof.fri_last_value; } - // For each FRI layer, starting from the layer 1: use the proof to verify the validity of values pįµ¢(āˆ’šœ^(2ⁱ)) (given by the prover) and - // pįµ¢(šœ^(2ⁱ)) (computed on the previous iteration by the verifier). Then use them to obtain pįµ¢ā‚Šā‚(šœ^(2ⁱ⁺¹)). - // Finally, check that the final value coincides with the given by the prover. - fri_layers_merkle_roots + // Guard zip alignment: the three iterables MUST have equal lengths. + // A malformed proof with mismatched lengths would otherwise silently + // truncate the verification or panic on the `len() - 1` below. + if fri_decommitment.layers_auth_paths.len() != fri_layers_merkle_roots.len() + || fri_decommitment.layers_evaluations_sym.len() != fri_layers_merkle_roots.len() + { + return false; + } + + // For each FRI layer, verify openings then fold to the next layer's + // evaluation. `evaluation_point_squared` is stepped in-place instead + // of pre-collecting a Vec, and a failed opening short-circuits the + // remaining Merkle work (each call is ~logā‚‚(N) Keccak hashes). + let last_layer_idx = fri_layers_merkle_roots.len() - 1; + let mut evaluation_point_squared = evaluation_point_inv.square(); + for (i, ((merkle_root, auth_path_sym), evaluation_sym)) in fri_layers_merkle_roots .iter() - .enumerate() .zip(&fri_decommitment.layers_auth_paths) .zip(&fri_decommitment.layers_evaluations_sym) - .zip(evaluation_point_vec) - .fold( - true, - |result, - ( - (((i, merkle_root), auth_path_sym), evaluation_sym), - evaluation_point_inv, - )| { - // Verify opening Open(pįµ¢(Dā‚–), āˆ’šœ^(2ⁱ)) and Open(pįµ¢(Dā‚–), šœ^(2ⁱ)). - // `v` is pįµ¢(šœ^(2ⁱ)). - // `evaluation_sym` is pįµ¢(āˆ’šœ^(2ⁱ)). - let openings_ok = Self::verify_fri_layer_openings( - merkle_root, - auth_path_sym, - &v, - evaluation_sym, - index, - ); - - // Update `v` with next value pįµ¢ā‚Šā‚(šœ^(2ⁱ⁺¹)). - v = (&v + evaluation_sym) + evaluation_point_inv * &zetas[i + 1] * (&v - evaluation_sym); - - // Update index for next iteration. The index of the squares in the next layer - // is obtained by halving the current index. This is due to the bit-reverse - // ordering of the elements in the Merkle tree. - index >>= 1; - - if i < fri_decommitment.layers_evaluations_sym.len() - 1 { - result & openings_ok - } else { - // Check that final value is the given by the prover - result & (v == proof.fri_last_value) & openings_ok - } - }, - ) + .enumerate() + { + // Verify opening Open(pįµ¢(Dā‚–), āˆ’šœ^(2ⁱ)) and Open(pįµ¢(Dā‚–), šœ^(2ⁱ)). + // `v` is pįµ¢(šœ^(2ⁱ)). `evaluation_sym` is pįµ¢(āˆ’šœ^(2ⁱ)). + if !Self::verify_fri_layer_openings( + merkle_root, + auth_path_sym, + &v, + evaluation_sym, + index, + ) { + return false; + } + + // Update `v` with next value pįµ¢ā‚Šā‚(šœ^(2ⁱ⁺¹)). + v = (&v + evaluation_sym) + + &evaluation_point_squared * &zetas[i + 1] * (&v - evaluation_sym); + + // Index of the squares in the next layer = current index halved + // (bit-reverse ordering of the Merkle tree). + index >>= 1; + + if i == last_layer_idx { + return v == proof.fri_last_value; + } + evaluation_point_squared = evaluation_point_squared.square(); + } + + // Unreachable: the length guard above ensures the loop iterates at + // least once (we passed the is_empty check) and hits the + // `i == last_layer_idx` return. + unreachable!("loop must hit the last_layer_idx return") } fn reconstruct_deep_composition_poly_evaluations_for_all_queries( @@ -562,11 +576,19 @@ pub trait IsStarkVerifier< let primitive_root = &Field::get_primitive_root_of_unity(domain.root_order as u64) .expect("verifier domain root_order is a valid power of two"); + // Scratch buffers hoisted out of the query loop: each iteration calls + // `.clear()` (preserves capacity) instead of re-allocating. With + // ~64-128 queries this saves hundreds of small allocations, which + // matters when the verifier runs as a RISC-V recursion guest. + let mut lde_base: Vec> = Vec::new(); + let mut lde_base_sym: Vec> = Vec::new(); + let mut denoms_scratch: Vec> = Vec::new(); + for (i, iota) in challenges.iotas.iter().enumerate() { let opening = &proof.deep_poly_openings[i]; // Base-field portion: precomputed columns FIRST, then main trace columns. - let mut lde_base: Vec> = Vec::new(); + lde_base.clear(); if let Some(p) = &opening.precomputed_trace_polys { lde_base.extend_from_slice(&p.evaluations); } @@ -587,10 +609,11 @@ pub trait IsStarkVerifier< &lde_base, lde_aux, &opening.composition_poly.evaluations, + &mut denoms_scratch, )?); // Mirror for the symmetric query point. - let mut lde_base_sym: Vec> = Vec::new(); + lde_base_sym.clear(); if let Some(p) = &opening.precomputed_trace_polys { lde_base_sym.extend_from_slice(&p.evaluations_sym); } @@ -611,11 +634,17 @@ pub trait IsStarkVerifier< &lde_base_sym, lde_aux_sym, &opening.composition_poly.evaluations_sym, + &mut denoms_scratch, )?); } Some((deep_poly_evaluations, deep_poly_evaluations_sym)) } + /// `denoms_scratch` is a caller-owned buffer reused across invocations. + /// It is cleared on entry and refilled with `ood_evaluations_table_height` + /// values, so passing the same buffer across all queries avoids a fresh + /// allocation per call (relevant in the recursion guest). + #[allow(clippy::too_many_arguments)] fn reconstruct_deep_composition_poly_evaluation( proof: &StarkProof, evaluation_point: &FieldElement, @@ -624,6 +653,7 @@ pub trait IsStarkVerifier< lde_trace_base_evaluations: &[FieldElement], lde_trace_aux_evaluations: &[FieldElement], lde_composition_poly_parts_evaluation: &[FieldElement], + denoms_scratch: &mut Vec>, ) -> Option> { let ood_evaluations_table_height = proof.trace_ood_evaluations.height; let ood_evaluations_table_width = proof.trace_ood_evaluations.width; @@ -645,14 +675,15 @@ pub trait IsStarkVerifier< return None; } - let mut denoms_trace = Vec::with_capacity(ood_evaluations_table_height); + denoms_scratch.clear(); + denoms_scratch.reserve(ood_evaluations_table_height); let mut current_z = challenges.z.clone(); for _ in 0..ood_evaluations_table_height { - denoms_trace.push(evaluation_point - ¤t_z); + denoms_scratch.push(evaluation_point - ¤t_z); current_z = primitive_root * ¤t_z; } // A malformed proof can land an OOD evaluation point on the LDE coset, reject. - FieldElement::inplace_batch_inverse(&mut denoms_trace).ok()?; + FieldElement::inplace_batch_inverse(denoms_scratch.as_mut_slice()).ok()?; let num_base = lde_trace_base_evaluations.len(); let trace_term = (0..ood_evaluations_table_width) @@ -668,7 +699,7 @@ pub trait IsStarkVerifier< } else { &lde_trace_aux_evaluations[col_idx - num_base] - ood_val }; - let poly_evaluation = diff * &denoms_trace[row_idx]; + let poly_evaluation = diff * &denoms_scratch[row_idx]; trace_t + &poly_evaluation * coeff }, ); @@ -723,11 +754,27 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, { - if airs.len() != multi_proof.proofs.len() { + Self::multi_verify_proofs(airs, &multi_proof.proofs, transcript, expected_bus_balance) + } + + /// Slice-taking variant of [`Self::multi_verify`]. Callers that already + /// hold a slice of proofs (or a single proof via [`core::slice::from_ref`]) + /// can call this directly without constructing a [`MultiProof`]. + fn multi_verify_proofs( + airs: &[&dyn AIR], + proofs: &[StarkProof], + transcript: &mut (impl IsStarkTranscript + Clone), + expected_bus_balance: &FieldElement, + ) -> bool + where + FieldElement: AsBytes + Sync + Send, + FieldElement: AsBytes + Sync + Send, + { + if airs.len() != proofs.len() { error!( "AIR count ({}) does not match proof count ({})", airs.len(), - multi_proof.proofs.len() + proofs.len() ); return false; } @@ -741,7 +788,7 @@ pub trait IsStarkVerifier< // For preprocessed tables, use the hardcoded commitment (verifier cannot // trust the prover). For normal tables, use the commitment from the proof. - for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() { + for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() { if air.is_preprocessed() { // Preprocessed table: VERIFY precomputed commitment matches hardcoded. // This is the critical soundness check - ensures prover used correct precomputed values. @@ -795,7 +842,7 @@ pub trait IsStarkVerifier< // boundary constraints on LogUp columns, so the bus balance check is // the only cross-table validation. - for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() { + for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() { if air.has_trace_interaction() && proof.bus_public_inputs.is_none() { error!( "Table {idx}: AIR has LogUp interactions but proof is missing bus_public_inputs" @@ -817,7 +864,7 @@ pub trait IsStarkVerifier< // state after Phase B, domain-separated by table index). This matches // the prover's forking and makes per-table verification independent. - for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() { + for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() { // Must match prover: fork with domain separator for multi-table, // use original transcript directly for single-table. let num_tables = airs.len(); @@ -865,7 +912,7 @@ pub trait IsStarkVerifier< if needs_lookup_challenges { let mut total = FieldElement::::zero(); - for (air, proof) in airs.iter().zip(&multi_proof.proofs) { + for (air, proof) in airs.iter().zip(proofs) { if air.has_trace_interaction() && let Some(interaction) = &proof.bus_public_inputs { @@ -898,12 +945,13 @@ pub trait IsStarkVerifier< where FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, - PI: Clone, { - let multi_proof = MultiProof { - proofs: vec![proof.clone()], - }; - Self::multi_verify(&[air], &multi_proof, transcript, &FieldElement::zero()) + Self::multi_verify_proofs( + &[air], + core::slice::from_ref(proof), + transcript, + &FieldElement::zero(), + ) } /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has