From ce78bbc891e2d854a3e07e0e1a9d06c1c149e632 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Tue, 26 May 2026 14:01:10 -0300
Subject: [PATCH] perf(stark/verifier): four micro-opts targeting the recursion
 guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All four changes preserve verifier semantics — `cargo test -p stark`
passes 126/126; `cargo test -p lambda-vm-prover --lib` keeps the
baseline 273 pass / 77 pre-existing failures (unrelated to constraints,
`UnknownSyscall(5)` on `main`); `make lint` is clean.

1) step_2_verify_claimed_composition_polynomial: drop the
   HashMap<usize, FE> step memo in favour of a small Vec scanned
   linearly. Boundary constraints number in the low tens; in RISC-V the
   HashMap's hash function + heap allocator outweigh O(B^2) scan savings.

2) reconstruct_deep_composition_poly_evaluations_for_all_queries: hoist
   lde_base / lde_base_sym / denoms_scratch out of the per-query loop
   and `.clear()` each iteration instead of fresh `Vec::new()`. At
   ~64-128 queries this removes hundreds of small allocations per
   verify. `reconstruct_deep_composition_poly_evaluation` now takes the
   scratch buffer as a `&mut Vec<FE<E>>` parameter.

3) verify: drop `proof.clone()` for the single-table wrapper. Added a
   `multi_verify_proofs` slice-taking variant; `multi_verify` is now a
   one-line wrapper preserving the existing `&MultiProof` API for all
   current callers (~50 across stark tests and prover).

4) verify_trace_openings and verify_query_and_sym_openings: replace
   `result & openings_ok` (always-evaluate fold) with early `return
   false` on each opening failure. Cuts wasted Merkle work on tampered
   proofs. The FRI loop additionally drops the pre-allocated
   `evaluation_point_vec` in favour of stepping `evaluation_point_squared`
   in-place — one fewer Vec alloc per query. A length-alignment guard
   makes the loop hit its `i == last_layer_idx` return for any
   well-formed proof.
---
 crypto/stark/src/verifier.rs | 226 +++++++++++++++++++++--------------
 1 file changed, 137 insertions(+), 89 deletions(-)
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 11d5c5c44..e55260360 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -25,7 +25,6 @@ use math::{
     },
     traits::AsBytes,
 };
-use std::collections::HashMap;
 use std::marker::PhantomData;
 #[cfg(feature = "instruments")]
 use std::time::Instant;
@@ -110,18 +109,23 @@ pub trait IsStarkVerifier<
             proof.bus_public_inputs.as_ref(),
             trace_length,
         );
-        // Precompute g^step once per distinct step to avoid the prior O(B^2)
-        // linear scan. A single pass populates a memo and resolves each
-        // constraint's step to its point in O(1) amortized.
-        let mut step_to_point: HashMap<usize, FieldElement<Field>> = HashMap::new();
+        // Precompute g^step once per distinct step. A small `Vec` with a
+        // linear scan beats `HashMap` here: boundary constraints typically
+        // number in the low tens, the recursion guest pays no allocator/hash
+        // overhead, and the AIR generally emits its constraints grouped by
+        // step so the scan hits the hot entry first.
+        let mut step_to_point: Vec<(usize, FieldElement<Field>)> = Vec::new();
         let boundary_points: Vec<FieldElement<Field>> = boundary_constraints
             .constraints
             .iter()
             .map(|c| {
-                step_to_point
-                    .entry(c.step)
-                    .or_insert_with(|| domain.trace_primitive_root.pow(c.step as u64))
-                    .clone()
+                if let Some((_, point)) = step_to_point.iter().find(|(s, _)| *s == c.step) {
+                    point.clone()
+                } else {
+                    let point = domain.trace_primitive_root.pow(c.step as u64);
+                    step_to_point.push((c.step, point.clone()));
+                    point
+                }
             })
             .collect();
 
@@ -352,37 +356,48 @@ pub trait IsStarkVerifier<
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         // Main trace (multiplicities for preprocessed, full trace for normal).
-        let mut ok = Self::verify_opening_pair::<Field>(
+        // Short-circuit on any failure: each opening pair is a Merkle-path
+        // verification (~20 Keccak hashes against base-field leaves); in the
+        // recursion guest this is non-trivial cycle cost worth skipping.
+        if !Self::verify_opening_pair::<Field>(
             &deep_poly_openings.main_trace_polys,
             &proof.lde_trace_main_merkle_root,
             iota,
-        );
+        ) {
+            return false;
+        }
 
         // Precomputed trace (preprocessed tables only). Mismatched presence is
         // unreachable in practice (multi_verify rejects such proofs upstream),
         // but a defensive check keeps this function self-contained.
-        ok &= match (
+        match (
             &proof.lde_trace_precomputed_merkle_root,
             &deep_poly_openings.precomputed_trace_polys,
         ) {
-            (Some(root), Some(opening)) => Self::verify_opening_pair::<Field>(opening, root, iota),
-            (None, None) => true,
-            _ => false,
-        };
+            (Some(root), Some(opening)) => {
+                if !Self::verify_opening_pair::<Field>(opening, root, iota) {
+                    return false;
+                }
+            }
+            (None, None) => {}
+            _ => return false,
+        }
 
         // Auxiliary trace.
-        ok &= match (
+        match (
             proof.lde_trace_aux_merkle_root,
             &deep_poly_openings.aux_trace_polys,
         ) {
             (Some(root), Some(opening)) => {
-                Self::verify_opening_pair::<FieldExtension>(opening, &root, iota)
+                if !Self::verify_opening_pair::<FieldExtension>(opening, &root, iota) {
+                    return false;
+                }
             }
-            (None, None) => true,
-            _ => false,
-        };
+            (None, None) => {}
+            _ => return false,
+        }
 
-        ok
+        true
     }
 
     /// Verify opening Open(Hᵢ(D_LDE), 𝜐) and Open(Hᵢ(D_LDE), -𝜐) for all parts Hᵢof the composition
@@ -480,71 +495,70 @@ pub trait IsStarkVerifier<
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         let fri_layers_merkle_roots = &proof.fri_layers_merkle_roots;
-        let evaluation_point_vec: Vec<FieldElement<Field>> =
-            core::iter::successors(Some(evaluation_point_inv.square()), |evaluation_point| {
-                Some(evaluation_point.square())
-            })
-            .take(fri_layers_merkle_roots.len())
-            .collect();
-
         let p0_eval = deep_composition_evaluation;
         let p0_eval_sym = deep_composition_evaluation_sym;
 
         // Reconstruct p₁(𝜐²)
         let mut v =
-            (p0_eval + p0_eval_sym) + evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym);
+            (p0_eval + p0_eval_sym) + &evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym);
         let mut index = iota;
 
         // Handle case with 0 FRI layers (trace_length <= 2)
-        // In this case, the fold loop below doesn't iterate, so we need to verify
-        // the final value directly here.
         if fri_layers_merkle_roots.is_empty() {
             return v == proof.fri_last_value;
         }
 
-        // For each FRI layer, starting from the layer 1: use the proof to verify the validity of values pᵢ(−𝜐^(2ⁱ)) (given by the prover) and
-        // pᵢ(𝜐^(2ⁱ)) (computed on the previous iteration by the verifier). Then use them to obtain pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
-        // Finally, check that the final value coincides with the given by the prover.
-        fri_layers_merkle_roots
+        // Guard zip alignment: the three iterables MUST have equal lengths.
+        // A malformed proof with mismatched lengths would otherwise silently
+        // truncate the verification or panic on the `len() - 1` below.
+        if fri_decommitment.layers_auth_paths.len() != fri_layers_merkle_roots.len()
+            || fri_decommitment.layers_evaluations_sym.len() != fri_layers_merkle_roots.len()
+        {
+            return false;
+        }
+
+        // For each FRI layer, verify openings then fold to the next layer's
+        // evaluation. `evaluation_point_squared` is stepped in-place instead
+        // of pre-collecting a Vec, and a failed opening short-circuits the
+        // remaining Merkle work (each call is ~log₂(N) Keccak hashes).
+        let last_layer_idx = fri_layers_merkle_roots.len() - 1;
+        let mut evaluation_point_squared = evaluation_point_inv.square();
+        for (i, ((merkle_root, auth_path_sym), evaluation_sym)) in fri_layers_merkle_roots
             .iter()
-            .enumerate()
             .zip(&fri_decommitment.layers_auth_paths)
             .zip(&fri_decommitment.layers_evaluations_sym)
-            .zip(evaluation_point_vec)
-            .fold(
-                true,
-                |result,
-                 (
-                    (((i, merkle_root), auth_path_sym), evaluation_sym),
-                    evaluation_point_inv,
-                )| {
-                    // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)).
-                    // `v` is pᵢ(𝜐^(2ⁱ)).
-                    // `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)).
-                    let openings_ok = Self::verify_fri_layer_openings(
-                        merkle_root,
-                        auth_path_sym,
-                        &v,
-                        evaluation_sym,
-                        index,
-                    );
-
-                    // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
-                    v = (&v + evaluation_sym) + evaluation_point_inv * &zetas[i + 1] * (&v - evaluation_sym);
-
-                    // Update index for next iteration. The index of the squares in the next layer
-                    // is obtained by halving the current index. This is due to the bit-reverse
-                    // ordering of the elements in the Merkle tree.
-                    index >>= 1;
-
-                    if i < fri_decommitment.layers_evaluations_sym.len() - 1 {
-                        result & openings_ok
-                    } else {
-                        // Check that final value is the given by the prover
-                        result & (v == proof.fri_last_value) & openings_ok
-                    }
-                },
-            )
+            .enumerate()
+        {
+            // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)).
+            // `v` is pᵢ(𝜐^(2ⁱ)). `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)).
+            if !Self::verify_fri_layer_openings(
+                merkle_root,
+                auth_path_sym,
+                &v,
+                evaluation_sym,
+                index,
+            ) {
+                return false;
+            }
+
+            // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
+            v = (&v + evaluation_sym)
+                + &evaluation_point_squared * &zetas[i + 1] * (&v - evaluation_sym);
+
+            // Index of the squares in the next layer = current index halved
+            // (bit-reverse ordering of the Merkle tree).
+            index >>= 1;
+
+            if i == last_layer_idx {
+                return v == proof.fri_last_value;
+            }
+            evaluation_point_squared = evaluation_point_squared.square();
+        }
+
+        // Unreachable: the length guard above ensures the loop iterates at
+        // least once (we passed the is_empty check) and hits the
+        // `i == last_layer_idx` return.
+        unreachable!("loop must hit the last_layer_idx return")
     }
 
     fn reconstruct_deep_composition_poly_evaluations_for_all_queries(
@@ -562,11 +576,19 @@ pub trait IsStarkVerifier<
         let primitive_root = &Field::get_primitive_root_of_unity(domain.root_order as u64)
             .expect("verifier domain root_order is a valid power of two");
 
+        // Scratch buffers hoisted out of the query loop: each iteration calls
+        // `.clear()` (preserves capacity) instead of re-allocating. With
+        // ~64-128 queries this saves hundreds of small allocations, which
+        // matters when the verifier runs as a RISC-V recursion guest.
+        let mut lde_base: Vec<FieldElement<Field>> = Vec::new();
+        let mut lde_base_sym: Vec<FieldElement<Field>> = Vec::new();
+        let mut denoms_scratch: Vec<FieldElement<FieldExtension>> = Vec::new();
+
         for (i, iota) in challenges.iotas.iter().enumerate() {
             let opening = &proof.deep_poly_openings[i];
 
             // Base-field portion: precomputed columns FIRST, then main trace columns.
-            let mut lde_base: Vec<FieldElement<Field>> = Vec::new();
+            lde_base.clear();
             if let Some(p) = &opening.precomputed_trace_polys {
                 lde_base.extend_from_slice(&p.evaluations);
             }
@@ -587,10 +609,11 @@ pub trait IsStarkVerifier<
                 &lde_base,
                 lde_aux,
                 &opening.composition_poly.evaluations,
+                &mut denoms_scratch,
             )?);
 
             // Mirror for the symmetric query point.
-            let mut lde_base_sym: Vec<FieldElement<Field>> = Vec::new();
+            lde_base_sym.clear();
             if let Some(p) = &opening.precomputed_trace_polys {
                 lde_base_sym.extend_from_slice(&p.evaluations_sym);
             }
@@ -611,11 +634,17 @@ pub trait IsStarkVerifier<
                 &lde_base_sym,
                 lde_aux_sym,
                 &opening.composition_poly.evaluations_sym,
+                &mut denoms_scratch,
             )?);
         }
         Some((deep_poly_evaluations, deep_poly_evaluations_sym))
     }
 
+    /// `denoms_scratch` is a caller-owned buffer reused across invocations.
+    /// It is cleared on entry and refilled with `ood_evaluations_table_height`
+    /// values, so passing the same buffer across all queries avoids a fresh
+    /// allocation per call (relevant in the recursion guest).
+    #[allow(clippy::too_many_arguments)]
     fn reconstruct_deep_composition_poly_evaluation(
         proof: &StarkProof<Field, FieldExtension, PI>,
         evaluation_point: &FieldElement<Field>,
@@ -624,6 +653,7 @@ pub trait IsStarkVerifier<
         lde_trace_base_evaluations: &[FieldElement<Field>],
         lde_trace_aux_evaluations: &[FieldElement<FieldExtension>],
         lde_composition_poly_parts_evaluation: &[FieldElement<FieldExtension>],
+        denoms_scratch: &mut Vec<FieldElement<FieldExtension>>,
     ) -> Option<FieldElement<FieldExtension>> {
         let ood_evaluations_table_height = proof.trace_ood_evaluations.height;
         let ood_evaluations_table_width = proof.trace_ood_evaluations.width;
@@ -645,14 +675,15 @@ pub trait IsStarkVerifier<
             return None;
         }
 
-        let mut denoms_trace = Vec::with_capacity(ood_evaluations_table_height);
+        denoms_scratch.clear();
+        denoms_scratch.reserve(ood_evaluations_table_height);
         let mut current_z = challenges.z.clone();
         for _ in 0..ood_evaluations_table_height {
-            denoms_trace.push(evaluation_point - &current_z);
+            denoms_scratch.push(evaluation_point - &current_z);
             current_z = primitive_root * &current_z;
         }
         // A malformed proof can land an OOD evaluation point on the LDE coset, reject.
-        FieldElement::inplace_batch_inverse(&mut denoms_trace).ok()?;
+        FieldElement::inplace_batch_inverse(denoms_scratch.as_mut_slice()).ok()?;
 
         let num_base = lde_trace_base_evaluations.len();
         let trace_term = (0..ood_evaluations_table_width)
@@ -668,7 +699,7 @@ pub trait IsStarkVerifier<
                         } else {
                             &lde_trace_aux_evaluations[col_idx - num_base] - ood_val
                         };
-                        let poly_evaluation = diff * &denoms_trace[row_idx];
+                        let poly_evaluation = diff * &denoms_scratch[row_idx];
                         trace_t + &poly_evaluation * coeff
                     },
                 );
@@ -723,11 +754,27 @@ pub trait IsStarkVerifier<
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
-        if airs.len() != multi_proof.proofs.len() {
+        Self::multi_verify_proofs(airs, &multi_proof.proofs, transcript, expected_bus_balance)
+    }
+
+    /// Slice-taking variant of [`Self::multi_verify`]. Callers that already
+    /// hold a slice of proofs (or a single proof via [`core::slice::from_ref`])
+    /// can call this directly without constructing a [`MultiProof`].
+    fn multi_verify_proofs(
+        airs: &[&dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>],
+        proofs: &[StarkProof<Field, FieldExtension, PI>],
+        transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone),
+        expected_bus_balance: &FieldElement<FieldExtension>,
+    ) -> bool
+    where
+        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+    {
+        if airs.len() != proofs.len() {
             error!(
                 "AIR count ({}) does not match proof count ({})",
                 airs.len(),
-                multi_proof.proofs.len()
+                proofs.len()
             );
             return false;
         }
@@ -741,7 +788,7 @@ pub trait IsStarkVerifier<
         // For preprocessed tables, use the hardcoded commitment (verifier cannot
         // trust the prover). For normal tables, use the commitment from the proof.
 
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+        for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() {
             if air.is_preprocessed() {
                 // Preprocessed table: VERIFY precomputed commitment matches hardcoded.
                 // This is the critical soundness check - ensures prover used correct precomputed values.
@@ -795,7 +842,7 @@ pub trait IsStarkVerifier<
         // boundary constraints on LogUp columns, so the bus balance check is
         // the only cross-table validation.
 
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+        for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() {
             if air.has_trace_interaction() && proof.bus_public_inputs.is_none() {
                 error!(
                     "Table {idx}: AIR has LogUp interactions but proof is missing bus_public_inputs"
@@ -817,7 +864,7 @@ pub trait IsStarkVerifier<
         // state after Phase B, domain-separated by table index). This matches
         // the prover's forking and makes per-table verification independent.
 
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+        for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() {
             // Must match prover: fork with domain separator for multi-table,
             // use original transcript directly for single-table.
             let num_tables = airs.len();
@@ -865,7 +912,7 @@ pub trait IsStarkVerifier<
 
         if needs_lookup_challenges {
             let mut total = FieldElement::<FieldExtension>::zero();
-            for (air, proof) in airs.iter().zip(&multi_proof.proofs) {
+            for (air, proof) in airs.iter().zip(proofs) {
                 if air.has_trace_interaction()
                     && let Some(interaction) = &proof.bus_public_inputs
                 {
@@ -898,12 +945,13 @@ pub trait IsStarkVerifier<
     where
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
-        PI: Clone,
     {
-        let multi_proof = MultiProof {
-            proofs: vec![proof.clone()],
-        };
-        Self::multi_verify(&[air], &multi_proof, transcript, &FieldElement::zero())
+        Self::multi_verify_proofs(
+            &[air],
+            core::slice::from_ref(proof),
+            transcript,
+            &FieldElement::zero(),
+        )
     }
 
     /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has