From afae49b6587e7a9cf6daec080658eb256dc0c833 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 15:18:31 -0300
Subject: [PATCH 1/8] perf(verifier): batch all DEEP denominators into one
 inversion

---
 crypto/stark/src/verifier.rs | 157 ++++++++++++++++++++++-------------
 1 file changed, 100 insertions(+), 57 deletions(-)
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 8091c8b32..73a2df162 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -562,68 +562,122 @@ pub trait IsStarkVerifier<
         let primitive_root = &Field::get_primitive_root_of_unity(domain.root_order as u64)
             .expect("verifier domain root_order is a valid power of two");
 
+        let height = proof.trace_ood_evaluations.height;
+        // Per-entry stride in the flat denominator buffer: `height` trace
+        // denominators followed by one composition denominator.
+        let stride = height + 1;
+
+        // Per-entry data carried from Pass 1 to Pass 2. We own `lde_base`
+        // (concatenated precomputed + main columns) and borrow the aux and
+        // composition evaluation slices straight out of the proof opening.
+        struct DeepEntry<'a, Field: IsField, FieldExtension: IsField> {
+            lde_base: Vec<FieldElement<Field>>,
+            lde_aux: &'a [FieldElement<FieldExtension>],
+            comp_evals: &'a [FieldElement<FieldExtension>],
+            is_sym: bool,
+        }
+
+        let mut entries: Vec<DeepEntry<Field, FieldExtension>> =
+            Vec::with_capacity(num_queries * 2);
+        // Flat buffer of all denominators across every (query, query-point).
+        // A SINGLE batch inversion is performed over this whole buffer.
+        let mut all_denoms: Vec<FieldElement<FieldExtension>> =
+            Vec::with_capacity(num_queries * 2 * stride);
+
+        // Pass 1: collect openings + denominators (no inversions yet).
         for (i, iota) in challenges.iotas.iter().enumerate() {
             let opening = &proof.deep_poly_openings[i];
 
-            // Base-field portion: precomputed columns FIRST, then main trace columns.
-            let mut lde_base: Vec<FieldElement<Field>> = Vec::new();
-            if let Some(p) = &opening.precomputed_trace_polys {
-                lde_base.extend_from_slice(&p.evaluations);
-            }
-            lde_base.extend_from_slice(&opening.main_trace_polys.evaluations);
+            for is_sym in [false, true] {
+                // Base-field portion: precomputed columns FIRST, then main trace columns.
+                let mut lde_base: Vec<FieldElement<Field>> = Vec::new();
+                if let Some(p) = &opening.precomputed_trace_polys {
+                    if is_sym {
+                        lde_base.extend_from_slice(&p.evaluations_sym);
+                    } else {
+                        lde_base.extend_from_slice(&p.evaluations);
+                    }
+                }
+                if is_sym {
+                    lde_base.extend_from_slice(&opening.main_trace_polys.evaluations_sym);
+                } else {
+                    lde_base.extend_from_slice(&opening.main_trace_polys.evaluations);
+                }
 
-            let lde_aux: &[FieldElement<FieldExtension>] = opening
-                .aux_trace_polys
-                .as_ref()
-                .map(|a| a.evaluations.as_slice())
-                .unwrap_or(&[]);
+                let lde_aux: &[FieldElement<FieldExtension>] = match &opening.aux_trace_polys {
+                    Some(a) if is_sym => a.evaluations_sym.as_slice(),
+                    Some(a) => a.evaluations.as_slice(),
+                    None => &[],
+                };
 
-            let evaluation_point = Self::query_challenge_to_evaluation_point(*iota, false, domain);
-            deep_poly_evaluations.push(Self::reconstruct_deep_composition_poly_evaluation(
-                proof,
-                &evaluation_point,
-                primitive_root,
-                challenges,
-                &lde_base,
-                lde_aux,
-                &opening.composition_poly.evaluations,
-            )?);
-
-            // Mirror for the symmetric query point.
-            let mut lde_base_sym: Vec<FieldElement<Field>> = Vec::new();
-            if let Some(p) = &opening.precomputed_trace_polys {
-                lde_base_sym.extend_from_slice(&p.evaluations_sym);
+                let comp_evals: &[FieldElement<FieldExtension>] = if is_sym {
+                    &opening.composition_poly.evaluations_sym
+                } else {
+                    &opening.composition_poly.evaluations
+                };
+
+                let evaluation_point =
+                    Self::query_challenge_to_evaluation_point(*iota, is_sym, domain);
+
+                // `height` trace denominators: (upsilon - z*g^k) for k = 0..height.
+                let mut current_z = challenges.z.clone();
+                for _ in 0..height {
+                    all_denoms.push(&evaluation_point - &current_z);
+                    current_z = primitive_root * &current_z;
+                }
+                // One composition denominator: (upsilon - z^N).
+                let z_pow = challenges.z.pow(comp_evals.len());
+                all_denoms.push(&evaluation_point - &z_pow);
+
+                entries.push(DeepEntry {
+                    lde_base,
+                    lde_aux,
+                    comp_evals,
+                    is_sym,
+                });
             }
-            lde_base_sym.extend_from_slice(&opening.main_trace_polys.evaluations_sym);
+        }
+
+        // Single global batch inversion. A malformed proof can make a
+        // denominator zero (query point equal to some z*g^k or to z^N); this
+        // rejects the whole verify, matching the prior per-call semantics.
+        FieldElement::inplace_batch_inverse(&mut all_denoms).ok()?;
 
-            let lde_aux_sym: &[FieldElement<FieldExtension>] = opening
-                .aux_trace_polys
-                .as_ref()
-                .map(|a| a.evaluations_sym.as_slice())
-                .unwrap_or(&[]);
+        // Pass 2: reconstruct each DEEP evaluation using the pre-inverted denoms.
+        for (e, entry) in entries.iter().enumerate() {
+            let denoms_slice = &all_denoms[e * stride..e * stride + stride];
+            let trace_denoms_inv = &denoms_slice[..height];
+            let comp_denom_inv = &denoms_slice[height];
 
-            let evaluation_point = Self::query_challenge_to_evaluation_point(*iota, true, domain);
-            deep_poly_evaluations_sym.push(Self::reconstruct_deep_composition_poly_evaluation(
+            let value = Self::reconstruct_deep_composition_poly_evaluation(
                 proof,
-                &evaluation_point,
-                primitive_root,
                 challenges,
-                &lde_base_sym,
-                lde_aux_sym,
-                &opening.composition_poly.evaluations_sym,
-            )?);
+                &entry.lde_base,
+                entry.lde_aux,
+                entry.comp_evals,
+                trace_denoms_inv,
+                comp_denom_inv,
+            )?;
+
+            if entry.is_sym {
+                deep_poly_evaluations_sym.push(value);
+            } else {
+                deep_poly_evaluations.push(value);
+            }
         }
+
         Some((deep_poly_evaluations, deep_poly_evaluations_sym))
     }
 
+    #[allow(clippy::too_many_arguments)]
     fn reconstruct_deep_composition_poly_evaluation(
         proof: &StarkProof<Field, FieldExtension, PI>,
-        evaluation_point: &FieldElement<Field>,
-        primitive_root: &FieldElement<Field>,
         challenges: &Challenges<FieldExtension>,
         lde_trace_base_evaluations: &[FieldElement<Field>],
         lde_trace_aux_evaluations: &[FieldElement<FieldExtension>],
         lde_composition_poly_parts_evaluation: &[FieldElement<FieldExtension>],
+        trace_denoms_inv: &[FieldElement<FieldExtension>],
+        comp_denom_inv: &FieldElement<FieldExtension>,
     ) -> Option<FieldElement<FieldExtension>> {
         let ood_evaluations_table_height = proof.trace_ood_evaluations.height;
         let ood_evaluations_table_width = proof.trace_ood_evaluations.width;
@@ -644,15 +698,9 @@ pub trait IsStarkVerifier<
         {
             return None;
         }
-
-        let mut denoms_trace = Vec::with_capacity(ood_evaluations_table_height);
-        let mut current_z = challenges.z.clone();
-        for _ in 0..ood_evaluations_table_height {
-            denoms_trace.push(evaluation_point - &current_z);
-            current_z = primitive_root * &current_z;
+        if trace_denoms_inv.len() != ood_evaluations_table_height {
+            return None;
         }
-        // A malformed proof can land an OOD evaluation point on the LDE coset, reject.
-        FieldElement::inplace_batch_inverse(&mut denoms_trace).ok()?;
 
         let num_base = lde_trace_base_evaluations.len();
         let trace_term = (0..ood_evaluations_table_width)
@@ -668,18 +716,13 @@ pub trait IsStarkVerifier<
                         } else {
                             &lde_trace_aux_evaluations[col_idx - num_base] - ood_val
                         };
-                        let poly_evaluation = diff * &denoms_trace[row_idx];
+                        let poly_evaluation = diff * &trace_denoms_inv[row_idx];
                         trace_t + &poly_evaluation * coeff
                     },
                 );
                 trace_terms + trace_i
             });
 
-        let number_of_parts = lde_composition_poly_parts_evaluation.len();
-        let z_pow = &challenges.z.pow(number_of_parts);
-
-        // A malformed proof can make evaluation_point == z^N, reject.
-        let denom_composition = (evaluation_point - z_pow).inv().ok()?;
         let mut h_terms = FieldElement::zero();
         for (j, h_i_upsilon) in lde_composition_poly_parts_evaluation.iter().enumerate() {
             // Bounds-check via `.get(j)?`: a malformed opening may have more
@@ -689,7 +732,7 @@ pub trait IsStarkVerifier<
             let h_i_term = (h_i_upsilon - h_i_zpower) * gamma;
             h_terms += h_i_term;
         }
-        h_terms *= denom_composition;
+        h_terms *= comp_denom_inv;
 
         Some(trace_term + h_terms)
     }

From 618f97f5ece4441ba0c08fd2ab7a9eb8bb536aa3 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 15:54:36 -0300
Subject: [PATCH 2/8] perf(verifier): drop per-query lde_base concatenation;
 hoist z_pow

---
 crypto/stark/src/verifier.rs | 66 +++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 73a2df162..b63823e9b 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -567,11 +567,19 @@ pub trait IsStarkVerifier<
         // denominators followed by one composition denominator.
         let stride = height + 1;
 
-        // Per-entry data carried from Pass 1 to Pass 2. We own `lde_base`
-        // (concatenated precomputed + main columns) and borrow the aux and
-        // composition evaluation slices straight out of the proof opening.
+        // The composition denominator exponent is constant across all queries:
+        // it is the number of composition poly parts the proof advertises (the
+        // same array the consumer validates against). Hoist `z^N` once.
+        let number_of_parts = proof.composition_poly_parts_ood_evaluation.len();
+        let z_pow = challenges.z.pow(number_of_parts);
+
+        // Per-entry data carried from Pass 1 to Pass 2. We borrow every slice
+        // straight out of the proof opening: precomputed and main base-field
+        // columns separately (avoiding a per-query concatenation allocation),
+        // plus the aux and composition evaluation slices.
         struct DeepEntry<'a, Field: IsField, FieldExtension: IsField> {
-            lde_base: Vec<FieldElement<Field>>,
+            lde_precomputed: &'a [FieldElement<Field>],
+            lde_main: &'a [FieldElement<Field>],
             lde_aux: &'a [FieldElement<FieldExtension>],
             comp_evals: &'a [FieldElement<FieldExtension>],
             is_sym: bool,
@@ -589,20 +597,20 @@ pub trait IsStarkVerifier<
             let opening = &proof.deep_poly_openings[i];
 
             for is_sym in [false, true] {
-                // Base-field portion: precomputed columns FIRST, then main trace columns.
-                let mut lde_base: Vec<FieldElement<Field>> = Vec::new();
-                if let Some(p) = &opening.precomputed_trace_polys {
-                    if is_sym {
-                        lde_base.extend_from_slice(&p.evaluations_sym);
-                    } else {
-                        lde_base.extend_from_slice(&p.evaluations);
-                    }
-                }
-                if is_sym {
-                    lde_base.extend_from_slice(&opening.main_trace_polys.evaluations_sym);
+                // Base-field portion: precomputed columns FIRST, then main trace
+                // columns. Borrow both slices directly (empty slice when the
+                // opening carries no precomputed trace).
+                let lde_precomputed: &[FieldElement<Field>] = match &opening.precomputed_trace_polys
+                {
+                    Some(p) if is_sym => p.evaluations_sym.as_slice(),
+                    Some(p) => p.evaluations.as_slice(),
+                    None => &[],
+                };
+                let lde_main: &[FieldElement<Field>] = if is_sym {
+                    opening.main_trace_polys.evaluations_sym.as_slice()
                 } else {
-                    lde_base.extend_from_slice(&opening.main_trace_polys.evaluations);
-                }
+                    opening.main_trace_polys.evaluations.as_slice()
+                };
 
                 let lde_aux: &[FieldElement<FieldExtension>] = match &opening.aux_trace_polys {
                     Some(a) if is_sym => a.evaluations_sym.as_slice(),
@@ -626,11 +634,11 @@ pub trait IsStarkVerifier<
                     current_z = primitive_root * &current_z;
                 }
                 // One composition denominator: (upsilon - z^N).
-                let z_pow = challenges.z.pow(comp_evals.len());
                 all_denoms.push(&evaluation_point - &z_pow);
 
                 entries.push(DeepEntry {
-                    lde_base,
+                    lde_precomputed,
+                    lde_main,
                     lde_aux,
                     comp_evals,
                     is_sym,
@@ -652,7 +660,8 @@ pub trait IsStarkVerifier<
             let value = Self::reconstruct_deep_composition_poly_evaluation(
                 proof,
                 challenges,
-                &entry.lde_base,
+                entry.lde_precomputed,
+                entry.lde_main,
                 entry.lde_aux,
                 entry.comp_evals,
                 trace_denoms_inv,
@@ -673,7 +682,8 @@ pub trait IsStarkVerifier<
     fn reconstruct_deep_composition_poly_evaluation(
         proof: &StarkProof<Field, FieldExtension, PI>,
         challenges: &Challenges<FieldExtension>,
-        lde_trace_base_evaluations: &[FieldElement<Field>],
+        lde_precomputed: &[FieldElement<Field>],
+        lde_main: &[FieldElement<Field>],
         lde_trace_aux_evaluations: &[FieldElement<FieldExtension>],
         lde_composition_poly_parts_evaluation: &[FieldElement<FieldExtension>],
         trace_denoms_inv: &[FieldElement<FieldExtension>],
@@ -687,9 +697,9 @@ pub trait IsStarkVerifier<
         // column count does not match the OOD table width, or whose composition
         // poly parts count does not match the proof's `composition_poly_parts_ood_evaluation`.
         // Without these checks the indexing below would panic in release builds.
-        if lde_trace_base_evaluations.len() + lde_trace_aux_evaluations.len()
-            != ood_evaluations_table_width
-        {
+        let num_precomp = lde_precomputed.len();
+        let num_base = num_precomp + lde_main.len();
+        if num_base + lde_trace_aux_evaluations.len() != ood_evaluations_table_width {
             return None;
         }
         if trace_term_coeffs.is_empty()
@@ -702,7 +712,6 @@ pub trait IsStarkVerifier<
             return None;
         }
 
-        let num_base = lde_trace_base_evaluations.len();
         let trace_term = (0..ood_evaluations_table_width)
             .zip(&challenges.trace_term_coeffs)
             .fold(FieldElement::zero(), |trace_terms, (col_idx, coeff_row)| {
@@ -711,8 +720,11 @@ pub trait IsStarkVerifier<
                     |trace_t, (row_idx, coeff)| {
                         let ood_val = &proof.trace_ood_evaluations.get_row(row_idx)[col_idx];
                         // Stay in base when we can: F: IsSubFieldOf<E> gives F - E -> E.
-                        let diff: FieldElement<FieldExtension> = if col_idx < num_base {
-                            &lde_trace_base_evaluations[col_idx] - ood_val
+                        // Base columns are precomputed first, then main, then aux.
+                        let diff: FieldElement<FieldExtension> = if col_idx < num_precomp {
+                            &lde_precomputed[col_idx] - ood_val
+                        } else if col_idx < num_base {
+                            &lde_main[col_idx - num_precomp] - ood_val
                         } else {
                             &lde_trace_aux_evaluations[col_idx - num_base] - ood_val
                         };

From 1958280f2bf8c356ff1f3bec55b614c35364fe77 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 15:59:59 -0300
Subject: [PATCH 3/8] perf(verifier): compute LogUp alpha powers once, slice
 per table

---
 crypto/stark/src/verifier.rs | 41 ++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index b63823e9b..bbe1efb20 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -102,6 +102,7 @@ pub trait IsStarkVerifier<
         proof: &StarkProof<Field, FieldExtension, PI>,
         domain: &VerifierDomain<Field>,
         challenges: &Challenges<FieldExtension>,
+        logup_alpha_powers: &[FieldElement<FieldExtension>],
     ) -> bool {
         let trace_length = proof.trace_length;
         let boundary_constraints = air.boundary_constraints(
@@ -173,14 +174,15 @@ pub trait IsStarkVerifier<
         let num_main_trace_columns =
             proof.trace_ood_evaluations.width - air.num_auxiliary_rap_columns();
 
-        let logup_alpha_powers: Vec<FieldElement<FieldExtension>> =
-            if challenges.rap_challenges.len() > LOGUP_CHALLENGE_ALPHA {
-                compute_alpha_powers(
-                    &challenges.rap_challenges[LOGUP_CHALLENGE_ALPHA],
-                    air.max_bus_elements(),
-                )
+        // Reuse a prefix slice of the globally-computed alpha powers instead of
+        // recomputing the multiplication chain per table. The global vector is
+        // sized to the maximum bus element count across all AIRs, so this
+        // table's prefix is always available; `.min` is purely defensive.
+        let logup_alpha_powers_slice: &[FieldElement<FieldExtension>] =
+            if !logup_alpha_powers.is_empty() {
+                &logup_alpha_powers[..air.max_bus_elements().min(logup_alpha_powers.len())]
             } else {
-                Vec::new()
+                &[]
             };
 
         let logup_table_offset = match &proof.bus_public_inputs {
@@ -201,7 +203,7 @@ pub trait IsStarkVerifier<
             &ood_frame,
             &periodic_values,
             &challenges.rap_challenges,
-            &logup_alpha_powers,
+            logup_alpha_powers_slice,
             &logup_table_offset,
             &packing_shifts,
         );
@@ -842,6 +844,19 @@ pub trait IsStarkVerifier<
             Vec::new()
         };
 
+        // Compute the LogUp alpha powers ONCE, up to the global maximum bus
+        // element count across all AIRs. `compute_alpha_powers` returns
+        // `[1, α, α², …]`, so a shorter run of powers is always a prefix of a
+        // longer one; since the alpha challenge is shared across all tables,
+        // each table can slice this global vector instead of recomputing.
+        let logup_alpha_powers_global: Vec<FieldElement<FieldExtension>> =
+            if lookup_challenges.len() > LOGUP_CHALLENGE_ALPHA {
+                let global_max_bus = airs.iter().map(|a| a.max_bus_elements()).max().unwrap_or(0);
+                compute_alpha_powers(&lookup_challenges[LOGUP_CHALLENGE_ALPHA], global_max_bus)
+            } else {
+                Vec::new()
+            };
+
         // =====================================================================
         // Validate bus_public_inputs presence against AIR layout
         // =====================================================================
@@ -897,6 +912,7 @@ pub trait IsStarkVerifier<
                 proof,
                 &mut table_transcript,
                 lookup_challenges.clone(),
+                &logup_alpha_powers_global,
             ) {
                 error!(
                     "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})",
@@ -1103,6 +1119,7 @@ pub trait IsStarkVerifier<
         proof: &StarkProof<Field, FieldExtension, PI>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
         rap_challenges: Vec<FieldElement<FieldExtension>>,
+        logup_alpha_powers: &[FieldElement<FieldExtension>],
     ) -> bool
     where
         FieldElement<Field>: AsBytes + Sync + Send,
@@ -1147,7 +1164,13 @@ pub trait IsStarkVerifier<
         #[cfg(feature = "instruments")]
         let timer2 = Instant::now();
 
-        if !Self::step_2_verify_claimed_composition_polynomial(air, proof, &domain, &challenges) {
+        if !Self::step_2_verify_claimed_composition_polynomial(
+            air,
+            proof,
+            &domain,
+            &challenges,
+            logup_alpha_powers,
+        ) {
             #[cfg(not(feature = "test_fiat_shamir"))]
             error!("Composition Polynomial verification failed");
             return false;

From 2a4cb7580c2769ed9be8b1efc45089521be80094 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 16:05:27 -0300
Subject: [PATCH 4/8] refactor(verifier): consolidate common pre-fork section
 in multi_verify; borrow lookup challenges

---
 crypto/stark/src/verifier.rs | 37 +++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index bbe1efb20..6f0ba6c62 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -792,6 +792,17 @@ pub trait IsStarkVerifier<
         // Check if any AIR has an auxiliary trace
         let needs_lookup_challenges = airs.iter().any(|air| air.has_aux_trace());
 
+        // #####################################################################
+        // ##### COMMON (shared, pre-fork) #####################################
+        // #####################################################################
+        // Everything below is computed ONCE on the shared transcript before any
+        // per-table fork: main commitments are appended, the shared LogUp
+        // challenges are sampled, the global alpha powers are derived, and the
+        // bus_public_inputs layout is validated. Only after this section do we
+        // fork the transcript per table. The exact sequence of transcript
+        // operations here is soundness-critical (Fiat-Shamir) and must match
+        // the prover byte-for-byte.
+
         // =====================================================================
         // Round 1, Phase A: Replay main trace commitments
         // =====================================================================
@@ -880,9 +891,12 @@ pub trait IsStarkVerifier<
             }
         }
 
-        // =====================================================================
-        // Phase C + Rounds 2-4: Forked per table
-        // =====================================================================
+        // #####################################################################
+        // ##### PER-TABLE (forked transcript) #################################
+        // #####################################################################
+        // The shared/common section is finished. From here each table branches.
+        //
+        // Phase C + Rounds 2-4: Forked per table.
         // Each table gets an independent transcript fork (cloned from the shared
         // state after Phase B, domain-separated by table index). This matches
         // the prover's forking and makes per-table verification independent.
@@ -911,7 +925,7 @@ pub trait IsStarkVerifier<
                 *air,
                 proof,
                 &mut table_transcript,
-                lookup_challenges.clone(),
+                &lookup_challenges,
                 &logup_alpha_powers_global,
             ) {
                 error!(
@@ -1118,7 +1132,7 @@ pub trait IsStarkVerifier<
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         proof: &StarkProof<Field, FieldExtension, PI>,
         transcript: &mut impl IsStarkTranscript<FieldExtension, Field>,
-        rap_challenges: Vec<FieldElement<FieldExtension>>,
+        rap_challenges: &[FieldElement<FieldExtension>],
         logup_alpha_powers: &[FieldElement<FieldExtension>],
     ) -> bool
     where
@@ -1137,8 +1151,17 @@ pub trait IsStarkVerifier<
         #[cfg(feature = "instruments")]
         let timer1 = Instant::now();
 
-        let challenges =
-            Self::replay_rounds_after_round_1(air, proof, &domain, transcript, rap_challenges);
+        // `replay_rounds_after_round_1` takes ownership of `rap_challenges`
+        // (it is stored owned in the returned `Challenges`), so copy the
+        // borrowed slice here. This still allocates once per table: the clone
+        // only moved from the `multi_verify` call site into this function.
+        let challenges = Self::replay_rounds_after_round_1(
+            air,
+            proof,
+            &domain,
+            transcript,
+            rap_challenges.to_vec(),
+        );
 
         // verify grinding
         let security_bits = air.context().proof_options.grinding_factor;

From 8f93fd83da0f42b50511155cbc5fe08d3a2b589e Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 16:31:29 -0300
Subject: [PATCH 5/8] perf(verifier): hash Merkle leaves from borrowed slices
 (no per-opening Vec alloc)

---
 .../backends/field_element_vector.rs          | 24 ++++++++++++++
 crypto/crypto/src/merkle_tree/proof.rs        | 22 +++++++++----
 .../src/tests/field_element_vector_tests.rs   | 33 +++++++++++++++++++
 crypto/stark/src/verifier.rs                  | 27 +++++++++------
 4 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs b/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs
index 25ba807c6..cc392843f 100644
--- a/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs
+++ b/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs
@@ -88,6 +88,30 @@ where
     }
 }
 
+impl<F, D, const NUM_BYTES: usize> FieldElementVectorBackend<F, D, NUM_BYTES>
+where
+    F: IsField,
+    FieldElement<F>: AsBytes,
+    D: Digest,
+    [u8; NUM_BYTES]: From<Output<D>>,
+{
+    /// Hash a sequence of borrowed field elements into a leaf node, identical
+    /// to `hash_data` over the same sequence — but without materializing a Vec.
+    pub fn hash_elements<'a, I>(elements: I) -> [u8; NUM_BYTES]
+    where
+        I: IntoIterator<Item = &'a FieldElement<F>>,
+        F: 'a,
+    {
+        let mut hasher = D::new();
+        for element in elements {
+            hasher.update(element.as_bytes());
+        }
+        let mut result_hash = [0u8; NUM_BYTES];
+        result_hash.copy_from_slice(&hasher.finalize());
+        result_hash
+    }
+}
+
 impl<F, D: Digest, const NUM_BYTES: usize> IsMerkleTreeBackend
     for FieldElementVectorBackend<F, D, NUM_BYTES>
 where
diff --git a/crypto/crypto/src/merkle_tree/proof.rs b/crypto/crypto/src/merkle_tree/proof.rs
index 20d5452a2..a502d0312 100644
--- a/crypto/crypto/src/merkle_tree/proof.rs
+++ b/crypto/crypto/src/merkle_tree/proof.rs
@@ -20,25 +20,35 @@ pub struct Proof<T: PartialEq + Eq> {
 }
 
 impl<T: PartialEq + Eq> Proof<T> {
-    /// Verifies a Merkle inclusion proof for the value contained at leaf index.
-    pub fn verify<B>(&self, root_hash: &B::Node, mut index: usize, value: &B::Data) -> bool
+    /// Verify inclusion when the caller already computed the leaf hash
+    /// (lets callers hash borrowed leaf data without materializing `B::Data`).
+    pub fn verify_hashed<B>(
+        &self,
+        root_hash: &B::Node,
+        mut index: usize,
+        mut hashed_value: B::Node,
+    ) -> bool
     where
         B: IsMerkleTreeBackend<Node = T>,
     {
-        let mut hashed_value = B::hash_data(value);
-
         for sibling_node in self.merkle_path.iter() {
             if index.is_multiple_of(2) {
                 hashed_value = B::hash_new_parent(&hashed_value, sibling_node);
             } else {
                 hashed_value = B::hash_new_parent(sibling_node, &hashed_value);
             }
-
             index >>= 1;
         }
-
         root_hash == &hashed_value
     }
+
+    /// Verifies a Merkle inclusion proof for the value contained at leaf index.
+    pub fn verify<B>(&self, root_hash: &B::Node, index: usize, value: &B::Data) -> bool
+    where
+        B: IsMerkleTreeBackend<Node = T>,
+    {
+        self.verify_hashed::<B>(root_hash, index, B::hash_data(value))
+    }
 }
 
 #[cfg(feature = "alloc")]
diff --git a/crypto/crypto/src/tests/field_element_vector_tests.rs b/crypto/crypto/src/tests/field_element_vector_tests.rs
index 145e3f463..75b946472 100644
--- a/crypto/crypto/src/tests/field_element_vector_tests.rs
+++ b/crypto/crypto/src/tests/field_element_vector_tests.rs
@@ -6,6 +6,7 @@ use sha3::{Keccak256, Keccak512, Sha3_256, Sha3_512};
 
 use crate::merkle_tree::{
     backends::field_element_vector::FieldElementVectorBackend, merkle::MerkleTree,
+    traits::IsMerkleTreeBackend,
 };
 
 type F = GoldilocksField;
@@ -120,3 +121,35 @@ fn hash_data_field_element_backend_works_with_sha2_512() {
         &values[0]
     ));
 }
+
+#[test]
+fn hash_elements_matches_hash_data_byte_for_byte() {
+    type Backend = FieldElementVectorBackend<F, Keccak256, 32>;
+
+    // Pseudo-random Vec generated from a simple LCG so the test is deterministic
+    // yet exercises a non-trivial sequence of field elements.
+    let mut state: u64 = 0x9E3779B97F4A7C15;
+    let v: Vec<FE> = (0..37)
+        .map(|_| {
+            state = state
+                .wrapping_mul(6364136223846793005)
+                .wrapping_add(1442695040888963407);
+            FE::from(state)
+        })
+        .collect();
+
+    let via_hash_data = Backend::hash_data(&v);
+    let via_hash_elements = Backend::hash_elements(v.iter());
+
+    assert_eq!(
+        via_hash_data, via_hash_elements,
+        "hash_elements must be byte-identical to hash_data over the same sequence"
+    );
+
+    // Empty sequence must also agree.
+    let empty: Vec<FE> = Vec::new();
+    assert_eq!(
+        Backend::hash_data(&empty),
+        Backend::hash_elements(empty.iter())
+    );
+}
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 6f0ba6c62..8ed86c891 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -316,7 +316,8 @@ pub trait IsStarkVerifier<
         E: IsField,
         Field: IsSubFieldOf<E>,
     {
-        proof.verify::<BatchedMerkleTreeBackend<E>>(root, index, &value.to_owned())
+        let leaf = BatchedMerkleTreeBackend::<E>::hash_elements(value.iter());
+        proof.verify_hashed::<BatchedMerkleTreeBackend<E>>(root, index, leaf)
     }
 
     /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings
@@ -398,16 +399,21 @@ pub trait IsStarkVerifier<
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
-        let mut value = deep_poly_openings.composition_poly.evaluations.clone();
-        value.extend_from_slice(&deep_poly_openings.composition_poly.evaluations_sym);
+        let leaf = BatchedMerkleTreeBackend::<FieldExtension>::hash_elements(
+            deep_poly_openings
+                .composition_poly
+                .evaluations
+                .iter()
+                .chain(deep_poly_openings.composition_poly.evaluations_sym.iter()),
+        );
 
         deep_poly_openings
             .composition_poly
             .proof
-            .verify::<BatchedMerkleTreeBackend<FieldExtension>>(
+            .verify_hashed::<BatchedMerkleTreeBackend<FieldExtension>>(
                 composition_poly_merkle_root,
                 *iota,
-                &value,
+                leaf,
             )
     }
 
@@ -447,16 +453,17 @@ pub trait IsStarkVerifier<
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
-        let evaluations = if iota % 2 == 1 {
-            vec![evaluation_sym.clone(), evaluation.clone()]
+        let (a, b) = if iota % 2 == 1 {
+            (evaluation_sym, evaluation)
         } else {
-            vec![evaluation.clone(), evaluation_sym.clone()]
+            (evaluation, evaluation_sym)
         };
+        let leaf = BatchedMerkleTreeBackend::<FieldExtension>::hash_elements([a, b]);
 
-        auth_path_sym.verify::<BatchedMerkleTreeBackend<FieldExtension>>(
+        auth_path_sym.verify_hashed::<BatchedMerkleTreeBackend<FieldExtension>>(
             merkle_root,
             iota >> 1,
-            &evaluations,
+            leaf,
         )
     }
 

From d4de863067b077aa4bd8c7c357f5952b1e6952d0 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 16:43:27 -0300
Subject: [PATCH 6/8] perf(verifier): fold in #626 micro-opts (Vec step-memo,
 slice multi_verify, opening short-circuit)

---
 crypto/stark/src/verifier.rs | 195 ++++++++++++++++++++---------------
 1 file changed, 114 insertions(+), 81 deletions(-)

diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 8ed86c891..5132718d5 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -25,7 +25,6 @@ use math::{
     },
     traits::AsBytes,
 };
-use std::collections::HashMap;
 use std::marker::PhantomData;
 #[cfg(feature = "instruments")]
 use std::time::Instant;
@@ -111,18 +110,23 @@ pub trait IsStarkVerifier<
             proof.bus_public_inputs.as_ref(),
             trace_length,
         );
-        // Precompute g^step once per distinct step to avoid the prior O(B^2)
-        // linear scan. A single pass populates a memo and resolves each
-        // constraint's step to its point in O(1) amortized.
-        let mut step_to_point: HashMap<usize, FieldElement<Field>> = HashMap::new();
+        // Precompute g^step once per distinct step. A small `Vec` with a
+        // linear scan is used instead of a `HashMap`: boundary constraints
+        // typically number in the low tens, so the memo holds only one
+        // entry per distinct step, no hashing is needed, and the forward
+        // scan over those few entries stays short.
+        let mut step_to_point: Vec<(usize, FieldElement<Field>)> = Vec::new();
         let boundary_points: Vec<FieldElement<Field>> = boundary_constraints
             .constraints
             .iter()
             .map(|c| {
-                step_to_point
-                    .entry(c.step)
-                    .or_insert_with(|| domain.trace_primitive_root.pow(c.step as u64))
-                    .clone()
+                if let Some((_, point)) = step_to_point.iter().find(|(s, _)| *s == c.step) {
+                    point.clone()
+                } else {
+                    let point = domain.trace_primitive_root.pow(c.step as u64);
+                    step_to_point.push((c.step, point.clone()));
+                    point
+                }
             })
             .collect();
 
@@ -355,37 +359,48 @@ pub trait IsStarkVerifier<
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         // Main trace (multiplicities for preprocessed, full trace for normal).
-        let mut ok = Self::verify_opening_pair::<Field>(
+        // Short-circuit on any failure: each opening pair is a Merkle-path
+        // verification (~20 Keccak hashes against base-field leaves); in the
+        // recursion guest this is non-trivial cycle cost worth skipping.
+        if !Self::verify_opening_pair::<Field>(
             &deep_poly_openings.main_trace_polys,
             &proof.lde_trace_main_merkle_root,
             iota,
-        );
+        ) {
+            return false;
+        }
 
         // Precomputed trace (preprocessed tables only). Mismatched presence is
         // unreachable in practice (multi_verify rejects such proofs upstream),
         // but a defensive check keeps this function self-contained.
-        ok &= match (
+        match (
             &proof.lde_trace_precomputed_merkle_root,
             &deep_poly_openings.precomputed_trace_polys,
         ) {
-            (Some(root), Some(opening)) => Self::verify_opening_pair::<Field>(opening, root, iota),
-            (None, None) => true,
-            _ => false,
-        };
+            (Some(root), Some(opening)) => {
+                if !Self::verify_opening_pair::<Field>(opening, root, iota) {
+                    return false;
+                }
+            }
+            (None, None) => {}
+            _ => return false,
+        }
 
         // Auxiliary trace.
-        ok &= match (
+        match (
             proof.lde_trace_aux_merkle_root,
             &deep_poly_openings.aux_trace_polys,
         ) {
             (Some(root), Some(opening)) => {
-                Self::verify_opening_pair::<FieldExtension>(opening, &root, iota)
+                if !Self::verify_opening_pair::<FieldExtension>(opening, &root, iota) {
+                    return false;
+                }
             }
-            (None, None) => true,
-            _ => false,
-        };
+            (None, None) => {}
+            _ => return false,
+        }
 
-        ok
+        true
     }
 
     /// Verify opening Open(Hᵢ(D_LDE), 𝜐) and Open(Hᵢ(D_LDE), -𝜐) for all parts Hᵢof the composition
@@ -489,19 +504,12 @@ pub trait IsStarkVerifier<
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
         let fri_layers_merkle_roots = &proof.fri_layers_merkle_roots;
-        let evaluation_point_vec: Vec<FieldElement<Field>> =
-            core::iter::successors(Some(evaluation_point_inv.square()), |evaluation_point| {
-                Some(evaluation_point.square())
-            })
-            .take(fri_layers_merkle_roots.len())
-            .collect();
-
         let p0_eval = deep_composition_evaluation;
         let p0_eval_sym = deep_composition_evaluation_sym;
 
         // Reconstruct p₁(𝜐²)
         let mut v =
-            (p0_eval + p0_eval_sym) + evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym);
+            (p0_eval + p0_eval_sym) + &evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym);
         let mut index = iota;
 
         // Handle case with 0 FRI layers (trace_length <= 2)
@@ -511,49 +519,57 @@ pub trait IsStarkVerifier<
             return v == proof.fri_last_value;
         }
 
-        // For each FRI layer, starting from the layer 1: use the proof to verify the validity of values pᵢ(−𝜐^(2ⁱ)) (given by the prover) and
-        // pᵢ(𝜐^(2ⁱ)) (computed on the previous iteration by the verifier). Then use them to obtain pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
-        // Finally, check that the final value coincides with the given by the prover.
-        fri_layers_merkle_roots
+        // Guard zip alignment: the three iterables MUST have equal lengths.
+        // A malformed proof with shorter decommitment vectors would otherwise
+        // truncate the zip and fall through to the `unreachable!` below.
+        if fri_decommitment.layers_auth_paths.len() != fri_layers_merkle_roots.len()
+            || fri_decommitment.layers_evaluations_sym.len() != fri_layers_merkle_roots.len()
+        {
+            return false;
+        }
+
+        // For each FRI layer, verify openings then fold to the next layer's
+        // evaluation. `evaluation_point_squared` is stepped in-place instead
+        // of pre-collecting a Vec, and a failed opening short-circuits the
+        // remaining Merkle work (each call is ~log₂(N) Keccak hashes).
+        let last_layer_idx = fri_layers_merkle_roots.len() - 1;
+        let mut evaluation_point_squared = evaluation_point_inv.square();
+        for (i, ((merkle_root, auth_path_sym), evaluation_sym)) in fri_layers_merkle_roots
             .iter()
-            .enumerate()
             .zip(&fri_decommitment.layers_auth_paths)
             .zip(&fri_decommitment.layers_evaluations_sym)
-            .zip(evaluation_point_vec)
-            .fold(
-                true,
-                |result,
-                 (
-                    (((i, merkle_root), auth_path_sym), evaluation_sym),
-                    evaluation_point_inv,
-                )| {
-                    // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)).
-                    // `v` is pᵢ(𝜐^(2ⁱ)).
-                    // `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)).
-                    let openings_ok = Self::verify_fri_layer_openings(
-                        merkle_root,
-                        auth_path_sym,
-                        &v,
-                        evaluation_sym,
-                        index,
-                    );
-
-                    // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
-                    v = (&v + evaluation_sym) + evaluation_point_inv * &zetas[i + 1] * (&v - evaluation_sym);
-
-                    // Update index for next iteration. The index of the squares in the next layer
-                    // is obtained by halving the current index. This is due to the bit-reverse
-                    // ordering of the elements in the Merkle tree.
-                    index >>= 1;
-
-                    if i < fri_decommitment.layers_evaluations_sym.len() - 1 {
-                        result & openings_ok
-                    } else {
-                        // Check that final value is the given by the prover
-                        result & (v == proof.fri_last_value) & openings_ok
-                    }
-                },
-            )
+            .enumerate()
+        {
+            // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)).
+            // `v` is pᵢ(𝜐^(2ⁱ)). `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)).
+            if !Self::verify_fri_layer_openings(
+                merkle_root,
+                auth_path_sym,
+                &v,
+                evaluation_sym,
+                index,
+            ) {
+                return false;
+            }
+
+            // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)).
+            v = (&v + evaluation_sym)
+                + &evaluation_point_squared * &zetas[i + 1] * (&v - evaluation_sym);
+
+            // Index of the squares in the next layer = current index halved
+            // (bit-reverse ordering of the Merkle tree).
+            index >>= 1;
+
+            if i == last_layer_idx {
+                return v == proof.fri_last_value;
+            }
+            evaluation_point_squared = evaluation_point_squared.square();
+        }
+
+        // Unreachable: the 0-layer early return guarantees at least one
+        // layer, and the length guard guarantees the zip yields every
+        // layer, so the loop always hits the `i == last_layer_idx` return.
+        unreachable!("loop must hit the last_layer_idx return")
     }
 
     fn reconstruct_deep_composition_poly_evaluations_for_all_queries(
@@ -787,11 +803,27 @@ pub trait IsStarkVerifier<
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
     {
-        if airs.len() != multi_proof.proofs.len() {
+        Self::multi_verify_proofs(airs, &multi_proof.proofs, transcript, expected_bus_balance)
+    }
+
+    /// Slice-taking variant of [`Self::multi_verify`]. Callers that already
+    /// hold a slice of proofs (or a single proof via [`core::slice::from_ref`])
+    /// can call this directly without constructing a [`MultiProof`].
+    fn multi_verify_proofs(
+        airs: &[&dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>],
+        proofs: &[StarkProof<Field, FieldExtension, PI>],
+        transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone),
+        expected_bus_balance: &FieldElement<FieldExtension>,
+    ) -> bool
+    where
+        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+    {
+        if airs.len() != proofs.len() {
             error!(
                 "AIR count ({}) does not match proof count ({})",
                 airs.len(),
-                multi_proof.proofs.len()
+                proofs.len()
             );
             return false;
         }
@@ -816,7 +848,7 @@ pub trait IsStarkVerifier<
         // For preprocessed tables, use the hardcoded commitment (verifier cannot
         // trust the prover). For normal tables, use the commitment from the proof.
 
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+        for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() {
             if air.is_preprocessed() {
                 // Preprocessed table: VERIFY precomputed commitment matches hardcoded.
                 // This is the critical soundness check - ensures prover used correct precomputed values.
@@ -883,7 +915,7 @@ pub trait IsStarkVerifier<
         // boundary constraints on LogUp columns, so the bus balance check is
         // the only cross-table validation.
 
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+        for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() {
             if air.has_trace_interaction() && proof.bus_public_inputs.is_none() {
                 error!(
                     "Table {idx}: AIR has LogUp interactions but proof is missing bus_public_inputs"
@@ -908,7 +940,7 @@ pub trait IsStarkVerifier<
         // state after Phase B, domain-separated by table index). This matches
         // the prover's forking and makes per-table verification independent.
 
-        for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() {
+        for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() {
             // Must match prover: fork with domain separator for multi-table,
             // use original transcript directly for single-table.
             let num_tables = airs.len();
@@ -957,7 +989,7 @@ pub trait IsStarkVerifier<
 
         if needs_lookup_challenges {
             let mut total = FieldElement::<FieldExtension>::zero();
-            for (air, proof) in airs.iter().zip(&multi_proof.proofs) {
+            for (air, proof) in airs.iter().zip(proofs) {
                 if air.has_trace_interaction()
                     && let Some(interaction) = &proof.bus_public_inputs
                 {
@@ -990,12 +1022,13 @@ pub trait IsStarkVerifier<
     where
         FieldElement<Field>: AsBytes + Sync + Send,
         FieldElement<FieldExtension>: AsBytes + Sync + Send,
-        PI: Clone,
     {
-        let multi_proof = MultiProof {
-            proofs: vec![proof.clone()],
-        };
-        Self::multi_verify(&[air], &multi_proof, transcript, &FieldElement::zero())
+        Self::multi_verify_proofs(
+            &[air],
+            core::slice::from_ref(proof),
+            transcript,
+            &FieldElement::zero(),
+        )
     }
 
     /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has

From b85fd4670ea2983eb74800bc3f00bfdc5574799a Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 17:00:55 -0300
Subject: [PATCH 7/8] feat(recursion): keccak crate shim routing f1600/p1600 to
 the VM precompile

---
 keccak-precompile/Cargo.lock     |  35 ++
 keccak-precompile/Cargo.toml     |  31 ++
 keccak-precompile/LICENSE-APACHE | 201 +++++++++++
 keccak-precompile/LICENSE-MIT    |  25 ++
 keccak-precompile/README.md      |  67 ++++
 keccak-precompile/src/armv8.rs   | 192 ++++++++++
 keccak-precompile/src/lib.rs     | 601 +++++++++++++++++++++++++++++++
 keccak-precompile/src/unroll.rs  |  62 ++++
 8 files changed, 1214 insertions(+)
 create mode 100644 keccak-precompile/Cargo.lock
 create mode 100644 keccak-precompile/Cargo.toml
 create mode 100644 keccak-precompile/LICENSE-APACHE
 create mode 100644 keccak-precompile/LICENSE-MIT
 create mode 100644 keccak-precompile/README.md
 create mode 100644 keccak-precompile/src/armv8.rs
 create mode 100644 keccak-precompile/src/lib.rs
 create mode 100644 keccak-precompile/src/unroll.rs

diff --git a/keccak-precompile/Cargo.lock b/keccak-precompile/Cargo.lock
new file mode 100644
index 000000000..357211b01
--- /dev/null
+++ b/keccak-precompile/Cargo.lock
@@ -0,0 +1,35 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "keccak"
+version = "0.1.5"
+dependencies = [
+ "cpufeatures",
+ "keccak 0.1.6",
+]
+
+[[package]]
+name = "keccak"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653"
+dependencies = [
+ "cpufeatures",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.186"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
diff --git a/keccak-precompile/Cargo.toml b/keccak-precompile/Cargo.toml
new file mode 100644
index 000000000..f4ca59294
--- /dev/null
+++ b/keccak-precompile/Cargo.toml
@@ -0,0 +1,31 @@
+# Empty workspace table so this crate is NOT pulled into the main lambda-vm
+# workspace. It is meant to be consumed only via `[patch.crates-io]` in the
+# recursion guest. (Mirrors executor/programs/rust/keccak/Cargo.toml.)
+[workspace]
+
+[package]
+edition = "2018"
+name = "keccak"
+version = "0.1.5"
+authors = ["RustCrypto Developers"]
+description = """
+lambda-vm precompile shim for the `keccak` crate: routes Keccak-f[1600] /
+Keccak-p[1600, 24] to the RISC-V Keccak precompile on the riscv64 guest.
+Verbatim copy of keccak 0.1.5 except the p1600/f1600 override. Intended as a
+`[patch.crates-io]` override in the recursion guest.
+"""
+license = "Apache-2.0 OR MIT"
+repository = "https://github.com/lambdaclass/lambda_vm"
+
+[features]
+asm = []
+no_unroll = []
+simd = []
+
+[target."cfg(target_arch = \"aarch64\")".dependencies.cpufeatures]
+version = "0.2"
+
+[dev-dependencies]
+# Upstream keccak crate under a different name, so the faithfulness test can
+# compare against it without triggering a self-patch cycle.
+dev-dep-keccak = { package = "keccak", version = "0.1.5" }
diff --git a/keccak-precompile/LICENSE-APACHE b/keccak-precompile/LICENSE-APACHE
new file mode 100644
index 000000000..78173fa2e
--- /dev/null
+++ b/keccak-precompile/LICENSE-APACHE
@@ -0,0 +1,201 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+   To apply the Apache License to your work, attach the following
+   boilerplate notice, with the fields enclosed by brackets "[]"
+   replaced with your own identifying information. (Don't include
+   the brackets!)  The text should be enclosed in the appropriate
+   comment syntax for the file format. We also recommend that a
+   file or class name and description of purpose be included on the
+   same "printed page" as the copyright notice for easier
+   identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/keccak-precompile/LICENSE-MIT b/keccak-precompile/LICENSE-MIT
new file mode 100644
index 000000000..81a3d57ac
--- /dev/null
+++ b/keccak-precompile/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2018-2022 RustCrypto Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/keccak-precompile/README.md b/keccak-precompile/README.md
new file mode 100644
index 000000000..d5d3adb32
--- /dev/null
+++ b/keccak-precompile/README.md
@@ -0,0 +1,67 @@
+# keccak-precompile
+
+A drop-in shim for the [`keccak`](https://crates.io/crates/keccak) crate
+(version 0.1.5) that routes the Keccak-f[1600] / Keccak-p[1600, 24]
+permutation to the lambda-vm RISC-V **Keccak precompile** when compiled for
+the `riscv64` guest.
+
+## What it is
+
+`src/lib.rs` is a **verbatim copy** of upstream `keccak` 0.1.5, with a single
+targeted change: the non-asm `p1600` / `f1600` functions delegate the full
+24-round permutation to the VM precompile via an `ecall`:
+
+```rust
+#[cfg(target_arch = "riscv64")]
+if round_count == 24 {
+    // ABI: a0 = state ptr, a7 = usize::MAX - 1
+    unsafe { core::arch::asm!("ecall", in("a0") state.as_mut_ptr(), in("a7") usize::MAX - 1); }
+    return;
+}
+```
+
+Every other code path (reduced-round `p1600`, the `LaneSize` trait, `keccak_p`,
+the round constants, the `u8`/`u16`/`u32` lane sizes, the `simd` module, and the
+aarch64 `asm` variants) is unchanged, so host builds behave exactly like
+upstream.
+
+## Why it works
+
+The STARK verifier hashes via `sha3::Keccak256`. `sha3` 0.10.x performs its
+permutation by calling `keccak::p1600(&mut state, 24)`. By overriding that
+single function and patching the `keccak` crate, **all** of `sha3`'s usage —
+the Merkle tree backend and the Fiat-Shamir transcript — transparently routes
+to the precompile on `riscv64`, while reusing `sha3`'s correct sponge and
+padding.
+
+Correctness rests on a single property: the precompile computes exactly
+Keccak-f[1600] (= 24-round `p1600`) in place over the `[u64; 25]` state.
+Given that, hashes are byte-identical to the software implementation.
+
+## How the recursion guest enables it
+
+Add to the **guest's root `Cargo.toml`**:
+
+```toml
+[patch.crates-io]
+keccak = { path = "<relative path to>/keccak-precompile" }
+```
+
+This replaces the upstream `keccak` dependency pulled in transitively by
+`sha3`. On `riscv64` the permutation hits the precompile; on host targets the
+build is unchanged.
+
+## Testing
+
+The riscv64 precompile path cannot run on the host. The included unit test
+(`cargo test`) verifies the copied software permutation is faithful by:
+
+1. Asserting `f1600` of the all-zero state matches the standard Keccak-f[1600]
+   test vector, and that `p1600(_, 24)` equals `f1600`.
+2. Comparing `crate::f1600` / `crate::p1600` against the upstream `keccak`
+   crate (pulled in as a renamed dev-dependency `dev-dep-keccak` to avoid a
+   self-patch cycle) over many pseudo-random states and reduced round counts.
+
+## License
+
+Apache-2.0 OR MIT, matching upstream `keccak`.
diff --git a/keccak-precompile/src/armv8.rs b/keccak-precompile/src/armv8.rs
new file mode 100644
index 000000000..698c8a105
--- /dev/null
+++ b/keccak-precompile/src/armv8.rs
@@ -0,0 +1,192 @@
+/// Keccak-p1600 on ARMv8.4-A with FEAT_SHA3.
+///
+/// Applies `round_count` rounds to `state` in place. The state is loaded into
+/// `v0`-`v24`, the round loop runs entirely in registers, and the result is
+/// stored back through the same pointer. Round constants are read starting at
+/// `crate::RC[24 - round_count]`.
+///
+/// See K12.2.2 (p. 11,749) of the ARM Reference manual.
+/// Adapted from the Keccak-f1600 implementation in the XKCP/K12.
+/// see <https://github.com/XKCP/K12/blob/df6a21e6d1f34c1aa36e8d702540899c97dba5a0/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S#L69>
+///
+/// # Safety
+///
+/// - The executing CPU must support the `sha3` target feature this function
+///   is compiled with.
+/// - `round_count` must be in `1..=24`. The round loop decrements its counter
+///   *before* testing it, so the body always runs at least once: passing `0`
+///   wraps the counter and keeps loading round constants past the end of
+///   `crate::RC`. Values above 24 make `24 - round_count` underflow.
+#[target_feature(enable = "sha3")]
+pub unsafe fn p1600_armv8_sha3_asm(state: &mut [u64; 25], round_count: usize) {
+    core::arch::asm!("
+        // Read state
+        ld1.1d {{ v0- v3}}, [x0], #32
+        ld1.1d {{ v4- v7}}, [x0], #32
+        ld1.1d {{ v8-v11}}, [x0], #32
+        ld1.1d {{v12-v15}}, [x0], #32
+        ld1.1d {{v16-v19}}, [x0], #32
+        ld1.1d {{v20-v23}}, [x0], #32
+        ld1.1d {{v24}},     [x0]
+        sub x0, x0, #192
+
+        // NOTE: This loop actually computes two f1600 functions in
+        // parallel, in both the lower and the upper 64-bit of the
+        // 128-bit registers v0-v24.
+    0:  sub	x8, x8, #1
+
+        // Theta Calculations
+        eor3.16b   v25, v20, v15, v10
+        eor3.16b   v26, v21, v16, v11
+        eor3.16b   v27, v22, v17, v12
+        eor3.16b   v28, v23, v18, v13
+        eor3.16b   v29, v24, v19, v14
+        eor3.16b   v25, v25,  v5,  v0
+        eor3.16b   v26, v26,  v6,  v1
+        eor3.16b   v27, v27,  v7,  v2
+        eor3.16b   v28, v28,  v8,  v3
+        eor3.16b   v29, v29,  v9,  v4
+        rax1.2d    v30, v25, v27
+        rax1.2d    v31, v26, v28
+        rax1.2d    v27, v27, v29
+        rax1.2d    v28, v28, v25
+        rax1.2d    v29, v29, v26
+
+        // Rho and Phi
+        eor.16b     v0,  v0, v29
+        xar.2d     v25,  v1, v30, #64 -  1
+        xar.2d      v1,  v6, v30, #64 - 44
+        xar.2d      v6,  v9, v28, #64 - 20
+        xar.2d      v9, v22, v31, #64 - 61
+        xar.2d     v22, v14, v28, #64 - 39
+        xar.2d     v14, v20, v29, #64 - 18
+        xar.2d     v26,  v2, v31, #64 - 62
+        xar.2d      v2, v12, v31, #64 - 43
+        xar.2d     v12, v13, v27, #64 - 25
+        xar.2d     v13, v19, v28, #64 -  8
+        xar.2d     v19, v23, v27, #64 - 56
+        xar.2d     v23, v15, v29, #64 - 41
+        xar.2d     v15,  v4, v28, #64 - 27
+        xar.2d     v28, v24, v28, #64 - 14
+        xar.2d     v24, v21, v30, #64 -  2
+        xar.2d      v8,  v8, v27, #64 - 55
+        xar.2d      v4, v16, v30, #64 - 45
+        xar.2d     v16,  v5, v29, #64 - 36
+        xar.2d      v5,  v3, v27, #64 - 28
+        xar.2d     v27, v18, v27, #64 - 21
+        xar.2d      v3, v17, v31, #64 - 15
+        xar.2d     v30, v11, v30, #64 - 10
+        xar.2d     v31,  v7, v31, #64 -  6
+        xar.2d     v29, v10, v29, #64 -  3
+
+        // Chi and Iota
+        bcax.16b   v20, v26, v22,  v8
+        bcax.16b   v21,  v8, v23, v22
+        bcax.16b   v22, v22, v24, v23
+        bcax.16b   v23, v23, v26, v24
+        bcax.16b   v24, v24,  v8, v26
+
+        ld1r.2d    {{v26}}, [x1], #8
+
+        bcax.16b   v17, v30, v19,  v3
+        bcax.16b   v18,  v3, v15, v19
+        bcax.16b   v19, v19, v16, v15
+        bcax.16b   v15, v15, v30, v16
+        bcax.16b   v16, v16,  v3, v30
+
+        bcax.16b   v10, v25, v12, v31
+        bcax.16b   v11, v31, v13, v12
+        bcax.16b   v12, v12, v14, v13
+        bcax.16b   v13, v13, v25, v14
+        bcax.16b   v14, v14, v31, v25
+
+        bcax.16b    v7, v29,  v9,  v4
+        bcax.16b    v8,  v4,  v5,  v9
+        bcax.16b    v9,  v9,  v6,  v5
+        bcax.16b    v5,  v5, v29,  v6
+        bcax.16b    v6,  v6,  v4, v29
+
+        bcax.16b    v3, v27,  v0, v28
+        bcax.16b    v4, v28,  v1,  v0
+        bcax.16b    v0,  v0,  v2,  v1
+        bcax.16b    v1,  v1, v27,  v2
+        bcax.16b    v2,  v2, v28, v27
+
+        eor.16b v0,v0,v26
+
+        // Rounds loop
+        cbnz    w8, 0b
+
+        // Write state
+        st1.1d	{{ v0- v3}}, [x0], #32
+        st1.1d	{{ v4- v7}}, [x0], #32
+        st1.1d	{{ v8-v11}}, [x0], #32
+        st1.1d	{{v12-v15}}, [x0], #32
+        st1.1d	{{v16-v19}}, [x0], #32
+        st1.1d	{{v20-v23}}, [x0], #32
+        st1.1d	{{v24}},     [x0]
+    ",
+        // x0: pointer to the 25-lane state; read at entry, written at exit.
+        in("x0") state.as_mut_ptr(),
+        // x1: pointer to the first round constant to apply; advanced by 8
+        // bytes per round by the `ld1r` post-increment.
+        in("x1") crate::RC[24-round_count..].as_ptr(),
+        // x8: remaining-round counter driving the `0:` loop.
+        in("x8") round_count,
+        // x0/x1/x8 and the vector registers are modified by the assembly;
+        // they are covered by the C ABI clobber list.
+        clobber_abi("C"),
+        options(nostack)
+    );
+}
+
+// Known-answer test for the assembly path. Only built when the test binary
+// itself is compiled with the `sha3` target feature enabled.
+#[cfg(all(test, target_feature = "sha3"))]
+mod tests {
+    use super::*;
+
+    /// Runs the 24-round permutation twice from the all-zero state and checks
+    /// both intermediate states against the published vectors.
+    #[test]
+    fn test_keccak_f1600() {
+        // Test vectors are copied from XKCP (eXtended Keccak Code Package)
+        // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt
+        let state_first = [
+            0xF1258F7940E1DDE7,
+            0x84D5CCF933C0478A,
+            0xD598261EA65AA9EE,
+            0xBD1547306F80494D,
+            0x8B284E056253D057,
+            0xFF97A42D7F8E6FD4,
+            0x90FEE5A0A44647C4,
+            0x8C5BDA0CD6192E76,
+            0xAD30A6F71B19059C,
+            0x30935AB7D08FFC64,
+            0xEB5AA93F2317D635,
+            0xA9A6E6260D712103,
+            0x81A57C16DBCF555F,
+            0x43B831CD0347C826,
+            0x01F22F1A11A5569F,
+            0x05E5635A21D9AE61,
+            0x64BEFEF28CC970F2,
+            0x613670957BC46611,
+            0xB87C5A554FD00ECB,
+            0x8C3EE88A1CCF32C8,
+            0x940C7922AE3A2614,
+            0x1841F924A2C509E4,
+            0x16F53526E70465C2,
+            0x75F644E97F30A13B,
+            0xEAF1FF7B5CECA249,
+        ];
+        let state_second = [
+            0x2D5C954DF96ECB3C,
+            0x6A332CD07057B56D,
+            0x093D8D1270D76B6C,
+            0x8A20D9B25569D094,
+            0x4F9C4F99E5E7F156,
+            0xF957B9A2DA65FB38,
+            0x85773DAE1275AF0D,
+            0xFAF4F247C3D810F7,
+            0x1F1B9EE6F79A8759,
+            0xE4FECC0FEE98B425,
+            0x68CE61B6B9CE68A1,
+            0xDEEA66C4BA8F974F,
+            0x33C43D836EAFB1F5,
+            0xE00654042719DBD9,
+            0x7CF8A9F009831265,
+            0xFD5449A6BF174743,
+            0x97DDAD33D8994B40,
+            0x48EAD5FC5D0BE774,
+            0xE3B8C8EE55B7B03C,
+            0x91A0226E649E42E9,
+            0x900E3129E7BADD7B,
+            0x202A9EC5FAA3CCE8,
+            0x5B3402464E1C3DB6,
+            0x609F4E62A44C1059,
+            0x20D06CD26A8FBF5C,
+        ];
+
+        // First application: all-zero state -> `state_first`.
+        let mut state = [0u64; 25];
+        unsafe { p1600_armv8_sha3_asm(&mut state, 24) };
+        assert_eq!(state, state_first);
+        // Second application chains on the previous output.
+        unsafe { p1600_armv8_sha3_asm(&mut state, 24) };
+        assert_eq!(state, state_second);
+    }
+}
diff --git a/keccak-precompile/src/lib.rs b/keccak-precompile/src/lib.rs
new file mode 100644
index 000000000..50c0bdb5d
--- /dev/null
+++ b/keccak-precompile/src/lib.rs
@@ -0,0 +1,601 @@
+//! Keccak [sponge function](https://en.wikipedia.org/wiki/Sponge_function).
+//!
+//! If you are looking for SHA-3 hash functions take a look at [`sha3`][1] and
+//! [`tiny-keccak`][2] crates.
+//!
+//! To disable loop unrolling (e.g. for constrained targets) use the
+//! `no_unroll` feature.
+//!
+//! ```
+//! // Test vectors are from KeccakCodePackage
+//! let mut data = [0u64; 25];
+//!
+//! keccak::f1600(&mut data);
+//! assert_eq!(data, [
+//!     0xF1258F7940E1DDE7, 0x84D5CCF933C0478A, 0xD598261EA65AA9EE, 0xBD1547306F80494D,
+//!     0x8B284E056253D057, 0xFF97A42D7F8E6FD4, 0x90FEE5A0A44647C4, 0x8C5BDA0CD6192E76,
+//!     0xAD30A6F71B19059C, 0x30935AB7D08FFC64, 0xEB5AA93F2317D635, 0xA9A6E6260D712103,
+//!     0x81A57C16DBCF555F, 0x43B831CD0347C826, 0x01F22F1A11A5569F, 0x05E5635A21D9AE61,
+//!     0x64BEFEF28CC970F2, 0x613670957BC46611, 0xB87C5A554FD00ECB, 0x8C3EE88A1CCF32C8,
+//!     0x940C7922AE3A2614, 0x1841F924A2C509E4, 0x16F53526E70465C2, 0x75F644E97F30A13B,
+//!     0xEAF1FF7B5CECA249,
+//! ]);
+//!
+//! keccak::f1600(&mut data);
+//! assert_eq!(data, [
+//!     0x2D5C954DF96ECB3C, 0x6A332CD07057B56D, 0x093D8D1270D76B6C, 0x8A20D9B25569D094,
+//!     0x4F9C4F99E5E7F156, 0xF957B9A2DA65FB38, 0x85773DAE1275AF0D, 0xFAF4F247C3D810F7,
+//!     0x1F1B9EE6F79A8759, 0xE4FECC0FEE98B425, 0x68CE61B6B9CE68A1, 0xDEEA66C4BA8F974F,
+//!     0x33C43D836EAFB1F5, 0xE00654042719DBD9, 0x7CF8A9F009831265, 0xFD5449A6BF174743,
+//!     0x97DDAD33D8994B40, 0x48EAD5FC5D0BE774, 0xE3B8C8EE55B7B03C, 0x91A0226E649E42E9,
+//!     0x900E3129E7BADD7B, 0x202A9EC5FAA3CCE8, 0x5B3402464E1C3DB6, 0x609F4E62A44C1059,
+//!     0x20D06CD26A8FBF5C,
+//! ]);
+//! ```
+//!
+//! [1]: https://docs.rs/sha3
+//! [2]: https://docs.rs/tiny-keccak
+
+#![no_std]
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
+#![doc(
+    html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg",
+    html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg"
+)]
+#![allow(non_upper_case_globals)]
+#![warn(
+    clippy::mod_module_files,
+    clippy::unwrap_used,
+    missing_docs,
+    rust_2018_idioms,
+    unused_lifetimes,
+    unused_qualifications
+)]
+
+use core::{
+    convert::TryInto,
+    fmt::Debug,
+    mem::size_of,
+    ops::{BitAnd, BitAndAssign, BitXor, BitXorAssign, Not},
+};
+
+#[rustfmt::skip]
+mod unroll;
+
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+mod armv8;
+
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+cpufeatures::new!(armv8_sha3_intrinsics, "sha3");
+
+/// Number of lanes in a Keccak state; every state in this crate is `[L; PLEN]`.
+const PLEN: usize = 25;
+
+/// Left-rotation amounts for the rho/pi step of `keccak_p`: step `x` of the
+/// 24-step lane walk rotates the carried lane by `RHO[x]`.
+const RHO: [u32; 24] = [
+    1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
+];
+
+/// Lane indices visited by the rho/pi step of `keccak_p`: step `x` of the
+/// walk reads and then overwrites `state[PI[x]]`.
+const PI: [usize; 24] = [
+    10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
+];
+
+/// Round constants XORed into lane 0 by the iota step. `keccak_p` applies the
+/// sub-slice that ends at the lane size's `KECCAK_F_ROUND_COUNT`, truncated to
+/// the lane width via `LaneSize::truncate_rc`.
+const RC: [u64; 24] = [
+    0x0000000000000001,
+    0x0000000000008082,
+    0x800000000000808a,
+    0x8000000080008000,
+    0x000000000000808b,
+    0x0000000080000001,
+    0x8000000080008081,
+    0x8000000000008009,
+    0x000000000000008a,
+    0x0000000000000088,
+    0x0000000080008009,
+    0x000000008000000a,
+    0x000000008000808b,
+    0x800000000000008b,
+    0x8000000000008089,
+    0x8000000000008003,
+    0x8000000000008002,
+    0x8000000000000080,
+    0x000000000000800a,
+    0x800000008000000a,
+    0x8000000080008081,
+    0x8000000000008080,
+    0x0000000080000001,
+    0x8000000080008008,
+];
+
+/// A lane type the Keccak permutation can operate on.
+///
+/// The permutation works on a state of `PLEN` lanes; this trait supplies the
+/// bitwise operations, the full round count, and the round-constant
+/// truncation needed for one lane width.
+pub trait LaneSize:
+    Copy
+    + Clone
+    + Default
+    + PartialEq
+    + Debug
+    + Not<Output = Self>
+    + BitAnd<Output = Self>
+    + BitAndAssign
+    + BitXor<Output = Self>
+    + BitXorAssign
+{
+    /// How many rounds the full Keccak-f permutation has for this lane width.
+    const KECCAK_F_ROUND_COUNT: usize;
+
+    /// Reduces a 64-bit round constant to this lane width.
+    fn truncate_rc(rc: u64) -> Self;
+
+    /// Rotates the lane left by `n` bits.
+    fn rotate_left(self, n: u32) -> Self;
+}
+
+// Implements `LaneSize` for a primitive integer lane.
+//
+// Arguments: the lane type, its full Keccak-f round count, and a closure-like
+// expression that narrows a `u64` round constant to the lane type.
+macro_rules! impl_lanesize {
+    ($lane:ty, $rounds:expr, $narrow:expr) => {
+        impl LaneSize for $lane {
+            const KECCAK_F_ROUND_COUNT: usize = $rounds;
+
+            fn truncate_rc(rc: u64) -> Self {
+                $narrow(rc)
+            }
+
+            fn rotate_left(self, n: u32) -> Self {
+                // Resolves to the integer's inherent `rotate_left`.
+                self.rotate_left(n)
+            }
+        }
+    };
+}
+
+// 8-bit lanes: keep only the lowest byte of the round constant.
+impl_lanesize!(u8, 18, |rc: u64| rc.to_le_bytes()[0]);
+// 16-bit lanes: keep the two lowest little-endian bytes.
+impl_lanesize!(u16, 20, |rc: u64| {
+    let le_bytes = rc.to_le_bytes();
+    let low = &le_bytes[..size_of::<Self>()];
+    // The slice is exactly `size_of::<Self>()` bytes, so the conversion
+    // cannot fail.
+    #[allow(clippy::unwrap_used)]
+    Self::from_le_bytes(low.try_into().unwrap())
+});
+// 32-bit lanes: keep the four lowest little-endian bytes.
+impl_lanesize!(u32, 22, |rc: u64| {
+    let le_bytes = rc.to_le_bytes();
+    let low = &le_bytes[..size_of::<Self>()];
+    // The slice is exactly `size_of::<Self>()` bytes, so the conversion
+    // cannot fail.
+    #[allow(clippy::unwrap_used)]
+    Self::from_le_bytes(low.try_into().unwrap())
+});
+// 64-bit lanes use the round constant unchanged.
+impl_lanesize!(u64, 24, |rc: u64| rc);
+
+// Generates the public `pNNN` / `fNNN` entry points for one lane type: the
+// first takes an explicit round count, the second always runs the lane type's
+// full `KECCAK_F_ROUND_COUNT`.
+macro_rules! impl_keccak {
+    ($p_fn:ident, $f_fn:ident, $lane:ty) => {
+        /// Keccak-p permutation: applies `round_count` rounds to `state`.
+        pub fn $p_fn(state: &mut [$lane; PLEN], round_count: usize) {
+            keccak_p(state, round_count);
+        }
+
+        /// Keccak-f permutation: applies the full round count to `state`.
+        pub fn $f_fn(state: &mut [$lane; PLEN]) {
+            let full_rounds = <$lane as LaneSize>::KECCAK_F_ROUND_COUNT;
+            keccak_p(state, full_rounds);
+        }
+    };
+}
+
+impl_keccak!(p200, f200, u8);
+impl_keccak!(p400, f400, u16);
+impl_keccak!(p800, f800, u32);
+
+/// Keccak-p[1600, rc] permutation.
+///
+/// When built for the lambda-vm RISC-V guest (`riscv64`), a request for the
+/// full 24 rounds is handed to the Keccak precompile through an `ecall`.
+/// Every other round count, and every other target, runs the software
+/// [`keccak_p`].
+#[cfg(not(all(target_arch = "aarch64", feature = "asm")))]
+pub fn p1600(state: &mut [u64; PLEN], round_count: usize) {
+    #[cfg(target_arch = "riscv64")]
+    {
+        // Call number selecting the KeccakPermute precompile (passed in a7).
+        let keccak_permute: usize = usize::MAX - 1;
+        if round_count == 24 {
+            // SAFETY: the host implements the KeccakPermute precompile, which
+            // applies Keccak-f[1600] in place to the 25-lane state whose
+            // address is passed in a0.
+            unsafe {
+                core::arch::asm!(
+                    "ecall",
+                    in("a0") state.as_mut_ptr(),
+                    in("a7") keccak_permute,
+                );
+            }
+            return;
+        }
+    }
+    keccak_p(state, round_count);
+}
+
+/// Keccak-f[1600] permutation: [`p1600`] run for the full round count of a
+/// 64-bit lane.
+#[cfg(not(all(target_arch = "aarch64", feature = "asm")))]
+pub fn f1600(state: &mut [u64; PLEN]) {
+    let full_rounds = <u64 as LaneSize>::KECCAK_F_ROUND_COUNT;
+    p1600(state, full_rounds);
+}
+
+/// Keccak-p[1600, rc] permutation.
+///
+/// Uses the ARMv8 SHA3 assembly routine when the CPU supports it and
+/// `round_count` is in `1..=24`; otherwise runs the software [`keccak_p`].
+///
+/// # Panics
+///
+/// Panics (inside [`keccak_p`]) if `round_count` is greater than 24.
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+pub fn p1600(state: &mut [u64; PLEN], round_count: usize) {
+    // The assembly loop decrements its counter before testing it and indexes
+    // `RC` at `24 - round_count`, so it is only sound for 1..=24 rounds.
+    // Anything else goes to the software path, which treats 0 as a no-op and
+    // rejects counts above 24 with its documented panic.
+    let asm_supports_count = (1..=u64::KECCAK_F_ROUND_COUNT).contains(&round_count);
+
+    if asm_supports_count && armv8_sha3_intrinsics::get() {
+        // SAFETY: the `sha3` feature was detected at runtime and
+        // `round_count` was just checked to be within 1..=24.
+        unsafe { armv8::p1600_armv8_sha3_asm(state, round_count) }
+    } else {
+        keccak_p(state, round_count);
+    }
+}
+
+/// Keccak-f[1600] permutation: the full 24 rounds, through the ARMv8 SHA3
+/// assembly routine when the CPU supports it and the software [`keccak_p`]
+/// otherwise.
+#[cfg(all(target_arch = "aarch64", feature = "asm"))]
+pub fn f1600(state: &mut [u64; PLEN]) {
+    if !armv8_sha3_intrinsics::get() {
+        keccak_p(state, u64::KECCAK_F_ROUND_COUNT);
+        return;
+    }
+
+    // SAFETY: the `sha3` feature was detected at runtime, and 24 is a round
+    // count the assembly routine accepts.
+    unsafe { armv8::p1600_armv8_sha3_asm(state, 24) }
+}
+
+#[cfg(feature = "simd")]
+/// SIMD implementations of the Keccak-f1600 permutation, using `u64xN`
+/// vectors as the lane type.
+pub mod simd {
+    use crate::{keccak_p, LaneSize, PLEN};
+    pub use core::simd::{u64x2, u64x4, u64x8};
+
+    // Implements `LaneSize` for one `u64xN` vector type. Round constants are
+    // broadcast to every element, and rotation is built from two shifts.
+    macro_rules! impl_lanesize_simd_u64xn {
+        ($vector:ty) => {
+            impl LaneSize for $vector {
+                const KECCAK_F_ROUND_COUNT: usize = 24;
+
+                fn truncate_rc(rc: u64) -> Self {
+                    Self::splat(rc)
+                }
+
+                fn rotate_left(self, n: u32) -> Self {
+                    let shifted_up = self << Self::splat(n.into());
+                    let wrapped_around = self >> Self::splat((64 - n).into());
+                    shifted_up | wrapped_around
+                }
+            }
+        };
+    }
+
+    impl_lanesize_simd_u64xn!(u64x2);
+    impl_lanesize_simd_u64xn!(u64x4);
+    impl_lanesize_simd_u64xn!(u64x8);
+
+    impl_keccak!(p1600x2, f1600x2, u64x2);
+    impl_keccak!(p1600x4, f1600x4, u64x4);
+    impl_keccak!(p1600x8, f1600x8, u64x8);
+}
+
+// NOTE(review): presumably needed because the last `last = array[0]` written
+// by the unrolled rho/pi walk is never read — confirm before removing.
+#[allow(unused_assignments)]
+/// Generic Keccak-p sponge function
+///
+/// Applies `round_count` rounds of the permutation to `state` in place, for
+/// any lane type implementing [`LaneSize`]. The rounds applied are the *last*
+/// `round_count` rounds of the full Keccak-f for that lane size; a
+/// `round_count` of 0 leaves `state` unchanged.
+///
+/// # Panics
+///
+/// Panics if `round_count` is greater than `L::KECCAK_F_ROUND_COUNT`.
+pub fn keccak_p<L: LaneSize>(state: &mut [L; PLEN], round_count: usize) {
+    if round_count > L::KECCAK_F_ROUND_COUNT {
+        panic!("A round_count greater than KECCAK_F_ROUND_COUNT is not supported!");
+    }
+
+    // https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf#page=25
+    // "the rounds of KECCAK-p[b, nr] match the last rounds of KECCAK-f[b]"
+    let round_consts = &RC[(L::KECCAK_F_ROUND_COUNT - round_count)..L::KECCAK_F_ROUND_COUNT];
+
+    // not unrolling this loop results in a much smaller function, plus
+    // it positively influences performance due to the smaller load on I-cache
+    for &rc in round_consts {
+        // Five-lane scratch buffer, reused by theta, rho/pi and chi below.
+        let mut array = [L::default(); 5];
+
+        // Theta
+        // First pass: fold every lane of column `x` into `array[x]`.
+        unroll5!(x, {
+            unroll5!(y, {
+                array[x] ^= state[5 * y + x];
+            });
+        });
+
+        // Second pass: mix each lane with the fold of the column before it
+        // and the fold of the column after it rotated left by one.
+        unroll5!(x, {
+            unroll5!(y, {
+                let t1 = array[(x + 4) % 5];
+                let t2 = array[(x + 1) % 5].rotate_left(1);
+                state[5 * y + x] ^= t1 ^ t2;
+            });
+        });
+
+        // Rho and pi
+        // Walk the 24 lanes listed in `PI`, starting from lane 1: each step
+        // saves the destination lane in `array[0]`, overwrites it with the
+        // carried lane rotated by `RHO[x]`, then carries the saved value on.
+        let mut last = state[1];
+        unroll24!(x, {
+            array[0] = state[PI[x]];
+            state[PI[x]] = last.rotate_left(RHO[x]);
+            last = array[0];
+        });
+
+        // Chi
+        // Processed one row of five lanes at a time (`y` is the row offset).
+        unroll5!(y_step, {
+            let y = 5 * y_step;
+
+            // Snapshot the row so the update below reads pre-chi values.
+            unroll5!(x, {
+                array[x] = state[y + x];
+            });
+
+            unroll5!(x, {
+                let t1 = !array[(x + 1) % 5];
+                let t2 = array[(x + 2) % 5];
+                state[y + x] = array[x] ^ (t1 & t2);
+            });
+        });
+
+        // Iota
+        // XOR this round's constant, truncated to the lane width, into lane 0.
+        state[0] ^= L::truncate_rc(rc);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{keccak_p, LaneSize, PLEN};
+
+    /// Applies the full Keccak-f permutation twice, starting from the default
+    /// (all-zero) state, and checks the state after each application against
+    /// `state_first` and then `state_second`.
+    fn keccak_f<L: LaneSize>(state_first: [L; PLEN], state_second: [L; PLEN]) {
+        let mut state = [L::default(); PLEN];
+
+        for expected in [state_first, state_second] {
+            keccak_p(&mut state, L::KECCAK_F_ROUND_COUNT);
+            assert_eq!(state, expected);
+        }
+    }
+
+    /// Known-answer test for Keccak-f[200] (8-bit lanes): two chained
+    /// applications starting from the all-zero state.
+    #[test]
+    fn keccak_f200() {
+        // Test vectors are copied from XKCP (eXtended Keccak Code Package)
+        // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-200-IntermediateValues.txt
+        let state_first = [
+            0x3C, 0x28, 0x26, 0x84, 0x1C, 0xB3, 0x5C, 0x17, 0x1E, 0xAA, 0xE9, 0xB8, 0x11, 0x13,
+            0x4C, 0xEA, 0xA3, 0x85, 0x2C, 0x69, 0xD2, 0xC5, 0xAB, 0xAF, 0xEA,
+        ];
+        let state_second = [
+            0x1B, 0xEF, 0x68, 0x94, 0x92, 0xA8, 0xA5, 0x43, 0xA5, 0x99, 0x9F, 0xDB, 0x83, 0x4E,
+            0x31, 0x66, 0xA1, 0x4B, 0xE8, 0x27, 0xD9, 0x50, 0x40, 0x47, 0x9E,
+        ];
+
+        keccak_f::<u8>(state_first, state_second);
+    }
+
+    /// Known-answer test for Keccak-f[400] (16-bit lanes): two chained
+    /// applications starting from the all-zero state.
+    #[test]
+    fn keccak_f400() {
+        // Test vectors are copied from XKCP (eXtended Keccak Code Package)
+        // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-400-IntermediateValues.txt
+        let state_first = [
+            0x09F5, 0x40AC, 0x0FA9, 0x14F5, 0xE89F, 0xECA0, 0x5BD1, 0x7870, 0xEFF0, 0xBF8F, 0x0337,
+            0x6052, 0xDC75, 0x0EC9, 0xE776, 0x5246, 0x59A1, 0x5D81, 0x6D95, 0x6E14, 0x633E, 0x58EE,
+            0x71FF, 0x714C, 0xB38E,
+        ];
+        let state_second = [
+            0xE537, 0xD5D6, 0xDBE7, 0xAAF3, 0x9BC7, 0xCA7D, 0x86B2, 0xFDEC, 0x692C, 0x4E5B, 0x67B1,
+            0x15AD, 0xA7F7, 0xA66F, 0x67FF, 0x3F8A, 0x2F99, 0xE2C2, 0x656B, 0x5F31, 0x5BA6, 0xCA29,
+            0xC224, 0xB85C, 0x097C,
+        ];
+
+        keccak_f::<u16>(state_first, state_second);
+    }
+
+    /// Known-answer test for Keccak-f[800] (32-bit lanes): two chained
+    /// applications starting from the all-zero state.
+    #[test]
+    fn keccak_f800() {
+        // Test vectors are copied from XKCP (eXtended Keccak Code Package)
+        // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-800-IntermediateValues.txt
+        let state_first = [
+            0xE531D45D, 0xF404C6FB, 0x23A0BF99, 0xF1F8452F, 0x51FFD042, 0xE539F578, 0xF00B80A7,
+            0xAF973664, 0xBF5AF34C, 0x227A2424, 0x88172715, 0x9F685884, 0xB15CD054, 0x1BF4FC0E,
+            0x6166FA91, 0x1A9E599A, 0xA3970A1F, 0xAB659687, 0xAFAB8D68, 0xE74B1015, 0x34001A98,
+            0x4119EFF3, 0x930A0E76, 0x87B28070, 0x11EFE996,
+        ];
+        let state_second = [
+            0x75BF2D0D, 0x9B610E89, 0xC826AF40, 0x64CD84AB, 0xF905BDD6, 0xBC832835, 0x5F8001B9,
+            0x15662CCE, 0x8E38C95E, 0x701FE543, 0x1B544380, 0x89ACDEFF, 0x51EDB5DE, 0x0E9702D9,
+            0x6C19AA16, 0xA2913EEE, 0x60754E9A, 0x9819063C, 0xF4709254, 0xD09F9084, 0x772DA259,
+            0x1DB35DF7, 0x5AA60162, 0x358825D5, 0xB3783BAB,
+        ];
+
+        keccak_f::<u32>(state_first, state_second);
+    }
+
+    /// Known-answer test for Keccak-f[1600] (64-bit lanes): two chained
+    /// applications starting from the all-zero state. This calls the generic
+    /// `keccak_p` through `keccak_f`, not the public `p1600` / `f1600`
+    /// wrappers.
+    #[test]
+    fn keccak_f1600() {
+        // Test vectors are copied from XKCP (eXtended Keccak Code Package)
+        // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt
+        let state_first = [
+            0xF1258F7940E1DDE7,
+            0x84D5CCF933C0478A,
+            0xD598261EA65AA9EE,
+            0xBD1547306F80494D,
+            0x8B284E056253D057,
+            0xFF97A42D7F8E6FD4,
+            0x90FEE5A0A44647C4,
+            0x8C5BDA0CD6192E76,
+            0xAD30A6F71B19059C,
+            0x30935AB7D08FFC64,
+            0xEB5AA93F2317D635,
+            0xA9A6E6260D712103,
+            0x81A57C16DBCF555F,
+            0x43B831CD0347C826,
+            0x01F22F1A11A5569F,
+            0x05E5635A21D9AE61,
+            0x64BEFEF28CC970F2,
+            0x613670957BC46611,
+            0xB87C5A554FD00ECB,
+            0x8C3EE88A1CCF32C8,
+            0x940C7922AE3A2614,
+            0x1841F924A2C509E4,
+            0x16F53526E70465C2,
+            0x75F644E97F30A13B,
+            0xEAF1FF7B5CECA249,
+        ];
+        let state_second = [
+            0x2D5C954DF96ECB3C,
+            0x6A332CD07057B56D,
+            0x093D8D1270D76B6C,
+            0x8A20D9B25569D094,
+            0x4F9C4F99E5E7F156,
+            0xF957B9A2DA65FB38,
+            0x85773DAE1275AF0D,
+            0xFAF4F247C3D810F7,
+            0x1F1B9EE6F79A8759,
+            0xE4FECC0FEE98B425,
+            0x68CE61B6B9CE68A1,
+            0xDEEA66C4BA8F974F,
+            0x33C43D836EAFB1F5,
+            0xE00654042719DBD9,
+            0x7CF8A9F009831265,
+            0xFD5449A6BF174743,
+            0x97DDAD33D8994B40,
+            0x48EAD5FC5D0BE774,
+            0xE3B8C8EE55B7B03C,
+            0x91A0226E649E42E9,
+            0x900E3129E7BADD7B,
+            0x202A9EC5FAA3CCE8,
+            0x5B3402464E1C3DB6,
+            0x609F4E62A44C1059,
+            0x20D06CD26A8FBF5C,
+        ];
+
+        keccak_f::<u64>(state_first, state_second);
+    }
+
+    // Faithfulness check for the precompile shim: on the host the software
+    // path runs, so `crate::f1600` / `crate::p1600(_, 24)` must produce
+    // byte-identical output to the upstream `keccak` crate (pulled in as a
+    // renamed dev-dependency to avoid a self-patch cycle).
+    #[test]
+    fn p1600_matches_upstream_f1600() {
+        // One xorshift64 step. The generator is deterministic, so any
+        // mismatch reported below is reproducible.
+        fn next_lane(x: &mut u64) -> u64 {
+            *x ^= *x << 13;
+            *x ^= *x >> 7;
+            *x ^= *x << 17;
+            *x
+        }
+
+        // Known Keccak-f[1600] test vector: f1600 of the all-zero state.
+        let expected: [u64; 25] = [
+            0xF1258F7940E1DDE7,
+            0x84D5CCF933C0478A,
+            0xD598261EA65AA9EE,
+            0xBD1547306F80494D,
+            0x8B284E056253D057,
+            0xFF97A42D7F8E6FD4,
+            0x90FEE5A0A44647C4,
+            0x8C5BDA0CD6192E76,
+            0xAD30A6F71B19059C,
+            0x30935AB7D08FFC64,
+            0xEB5AA93F2317D635,
+            0xA9A6E6260D712103,
+            0x81A57C16DBCF555F,
+            0x43B831CD0347C826,
+            0x01F22F1A11A5569F,
+            0x05E5635A21D9AE61,
+            0x64BEFEF28CC970F2,
+            0x613670957BC46611,
+            0xB87C5A554FD00ECB,
+            0x8C3EE88A1CCF32C8,
+            0x940C7922AE3A2614,
+            0x1841F924A2C509E4,
+            0x16F53526E70465C2,
+            0x75F644E97F30A13B,
+            0xEAF1FF7B5CECA249,
+        ];
+
+        let mut s = [0u64; 25];
+        crate::f1600(&mut s);
+        assert_eq!(s, expected, "f1600(0) must match the standard test vector");
+
+        // p1600(_, 24) must equal f1600.
+        let mut p = [0u64; 25];
+        crate::p1600(&mut p, 24);
+        assert_eq!(p, expected, "p1600(_, 24) must equal f1600");
+
+        // Cross-check against the upstream crate over pseudo-random states.
+        let mut x: u64 = 0x243F6A8885A308D3; // digits of pi
+        for _ in 0..256 {
+            let mut a = [0u64; 25];
+            for lane in a.iter_mut() {
+                *lane = next_lane(&mut x);
+            }
+            let mut b = a;
+
+            crate::f1600(&mut a);
+            dev_dep_keccak::f1600(&mut b);
+            assert_eq!(a, b, "shim f1600 must match upstream f1600");
+
+            // Also exercise reduced-round p1600 against upstream p1600.
+            for rc in [1usize, 12, 23, 24] {
+                let mut a = [0u64; 25];
+                for lane in a.iter_mut() {
+                    *lane = next_lane(&mut x);
+                }
+                let mut b = a;
+
+                crate::p1600(&mut a, rc);
+                dev_dep_keccak::p1600(&mut b, rc);
+                // Name the round count so a failure points at the right case.
+                assert_eq!(
+                    a, b,
+                    "shim p1600 must match upstream p1600 for round_count = {}",
+                    rc
+                );
+            }
+        }
+    }
+
+    // Known-answer tests for the SIMD lane types: the scalar Keccak-f[1600]
+    // vectors are broadcast to every vector element with `splat`.
+    #[cfg(feature = "simd")]
+    mod simd {
+        use super::keccak_f;
+        use core::simd::{u64x2, u64x4, u64x8};
+
+        // Generates one `#[test]` named `$name` that runs `keccak_f` for the
+        // vector type `$type`.
+        macro_rules! impl_keccak_f1600xn {
+            ($name:ident, $type:ty) => {
+                #[test]
+                fn $name() {
+                    // Test vectors are copied from XKCP (eXtended Keccak Code Package)
+                    // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt
+                    let state_first = [
+                        <$type>::splat(0xF1258F7940E1DDE7),
+                        <$type>::splat(0x84D5CCF933C0478A),
+                        <$type>::splat(0xD598261EA65AA9EE),
+                        <$type>::splat(0xBD1547306F80494D),
+                        <$type>::splat(0x8B284E056253D057),
+                        <$type>::splat(0xFF97A42D7F8E6FD4),
+                        <$type>::splat(0x90FEE5A0A44647C4),
+                        <$type>::splat(0x8C5BDA0CD6192E76),
+                        <$type>::splat(0xAD30A6F71B19059C),
+                        <$type>::splat(0x30935AB7D08FFC64),
+                        <$type>::splat(0xEB5AA93F2317D635),
+                        <$type>::splat(0xA9A6E6260D712103),
+                        <$type>::splat(0x81A57C16DBCF555F),
+                        <$type>::splat(0x43B831CD0347C826),
+                        <$type>::splat(0x01F22F1A11A5569F),
+                        <$type>::splat(0x05E5635A21D9AE61),
+                        <$type>::splat(0x64BEFEF28CC970F2),
+                        <$type>::splat(0x613670957BC46611),
+                        <$type>::splat(0xB87C5A554FD00ECB),
+                        <$type>::splat(0x8C3EE88A1CCF32C8),
+                        <$type>::splat(0x940C7922AE3A2614),
+                        <$type>::splat(0x1841F924A2C509E4),
+                        <$type>::splat(0x16F53526E70465C2),
+                        <$type>::splat(0x75F644E97F30A13B),
+                        <$type>::splat(0xEAF1FF7B5CECA249),
+                    ];
+                    let state_second = [
+                        <$type>::splat(0x2D5C954DF96ECB3C),
+                        <$type>::splat(0x6A332CD07057B56D),
+                        <$type>::splat(0x093D8D1270D76B6C),
+                        <$type>::splat(0x8A20D9B25569D094),
+                        <$type>::splat(0x4F9C4F99E5E7F156),
+                        <$type>::splat(0xF957B9A2DA65FB38),
+                        <$type>::splat(0x85773DAE1275AF0D),
+                        <$type>::splat(0xFAF4F247C3D810F7),
+                        <$type>::splat(0x1F1B9EE6F79A8759),
+                        <$type>::splat(0xE4FECC0FEE98B425),
+                        <$type>::splat(0x68CE61B6B9CE68A1),
+                        <$type>::splat(0xDEEA66C4BA8F974F),
+                        <$type>::splat(0x33C43D836EAFB1F5),
+                        <$type>::splat(0xE00654042719DBD9),
+                        <$type>::splat(0x7CF8A9F009831265),
+                        <$type>::splat(0xFD5449A6BF174743),
+                        <$type>::splat(0x97DDAD33D8994B40),
+                        <$type>::splat(0x48EAD5FC5D0BE774),
+                        <$type>::splat(0xE3B8C8EE55B7B03C),
+                        <$type>::splat(0x91A0226E649E42E9),
+                        <$type>::splat(0x900E3129E7BADD7B),
+                        <$type>::splat(0x202A9EC5FAA3CCE8),
+                        <$type>::splat(0x5B3402464E1C3DB6),
+                        <$type>::splat(0x609F4E62A44C1059),
+                        <$type>::splat(0x20D06CD26A8FBF5C),
+                    ];
+
+                    keccak_f::<$type>(state_first, state_second);
+                }
+            };
+        }
+
+        impl_keccak_f1600xn!(keccak_f1600x2, u64x2);
+        impl_keccak_f1600xn!(keccak_f1600x4, u64x4);
+        impl_keccak_f1600xn!(keccak_f1600x8, u64x8);
+    }
+}
diff --git a/keccak-precompile/src/unroll.rs b/keccak-precompile/src/unroll.rs
new file mode 100644
index 000000000..eab745b9d
--- /dev/null
+++ b/keccak-precompile/src/unroll.rs
@@ -0,0 +1,62 @@
+/// Expands `$body` five times, with `$var` bound to the constants `0..=4` in turn.
+#[cfg(not(feature = "no_unroll"))]
+#[macro_export]
+macro_rules! unroll5 {
+    ($var:ident, $body:block) => {
+        { const $var: usize = 0; $body; }
+        { const $var: usize = 1; $body; }
+        { const $var: usize = 2; $body; }
+        { const $var: usize = 3; $body; }
+        { const $var: usize = 4; $body; }
+    };
+}
+
+/// Loop form of `unroll5` for the `no_unroll` feature: runs `$body` in `for $var in 0..5`.
+#[cfg(feature = "no_unroll")]
+#[macro_export]
+macro_rules! unroll5 {
+    ($var:ident, $body:block) => {
+        for $var in 0..5 $body
+    }
+}
+
+/// Expands `$body` 24 times, with `$var` bound to the constants `0..=23` in turn.
+#[cfg(not(feature = "no_unroll"))]
+#[macro_export]
+macro_rules! unroll24 {
+    ($var: ident, $body: block) => {
+        { const $var: usize = 0; $body; }
+        { const $var: usize = 1; $body; }
+        { const $var: usize = 2; $body; }
+        { const $var: usize = 3; $body; }
+        { const $var: usize = 4; $body; }
+        { const $var: usize = 5; $body; }
+        { const $var: usize = 6; $body; }
+        { const $var: usize = 7; $body; }
+        { const $var: usize = 8; $body; }
+        { const $var: usize = 9; $body; }
+        { const $var: usize = 10; $body; }
+        { const $var: usize = 11; $body; }
+        { const $var: usize = 12; $body; }
+        { const $var: usize = 13; $body; }
+        { const $var: usize = 14; $body; }
+        { const $var: usize = 15; $body; }
+        { const $var: usize = 16; $body; }
+        { const $var: usize = 17; $body; }
+        { const $var: usize = 18; $body; }
+        { const $var: usize = 19; $body; }
+        { const $var: usize = 20; $body; }
+        { const $var: usize = 21; $body; }
+        { const $var: usize = 22; $body; }
+        { const $var: usize = 23; $body; }
+    };
+}
+
+/// Loop form of `unroll24` for the `no_unroll` feature: runs `$body` in `for $var in 0..24`.
+#[cfg(feature = "no_unroll")]
+#[macro_export]
+macro_rules! unroll24 {
+    ($var:ident, $body:block) => {
+        for $var in 0..24 $body
+    }
+}

From d279ab6298e693bfe4927eeb05700ee57fe5cfc9 Mon Sep 17 00:00:00 2001
From: diegokingston <dkingston@fi.uba.ar>
Date: Fri, 29 May 2026 17:05:02 -0300
Subject: [PATCH 8/8] tooling: A/B script to measure verifier with/without
 Keccak precompile

Toggles the keccak-precompile [patch] in an external recursion-guest
Cargo.toml, runs the verify bench both ways, prints the cycle comparison.
Parameterized by GUEST_DIR + VERIFY_BENCH_CMD (the guest harness lives
outside this repo).
---
 scripts/measure_verifier_precompile.sh | 95 ++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100755 scripts/measure_verifier_precompile.sh

diff --git a/scripts/measure_verifier_precompile.sh b/scripts/measure_verifier_precompile.sh
new file mode 100755
index 000000000..e3657a724
--- /dev/null
+++ b/scripts/measure_verifier_precompile.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# Measure the recursion-guest verifier WITH vs WITHOUT the Keccak precompile.
+#
+# The recursion guest (the STARK verifier compiled to RISC-V) lives in YOUR
+# guest crate, outside this repo. This script A/Bs the `keccak` precompile shim
+# by toggling a `[patch.crates-io] keccak = { path = <keccak-precompile> }`
+# entry in the guest's Cargo.toml, running your verify benchmark each time, and
+# printing both results so you can see the cycle drop.
+#
+# Usage:
+#   GUEST_DIR=/path/to/recursion-guest \
+#   VERIFY_BENCH_CMD='cargo run --release -- bench-verify' \
+#     scripts/measure_verifier_precompile.sh
+#
+# Required env:
+#   GUEST_DIR         Root of your recursion-guest crate (the one with the
+#                     Cargo.toml that builds the verifier-as-RISC-V program).
+#   VERIFY_BENCH_CMD  The command (run FROM $GUEST_DIR) that builds + runs the
+#                     guest verify and prints its RISC-V cycle count. Whatever
+#                     you already use to get the "40.5B / 67M" numbers.
+#
+# Optional env:
+#   CYCLE_GREP        A grep -oiE (case-insensitive) pattern used to pull the
+#                     cycle number out of the bench's captured stdout (default
+#                     tries common forms). Summary only; full output is shown.
+#
+# Correctness: the shim routes only the Keccak-f[1600] permutation to the VM
+# precompile (a0=state ptr, a7=usize::MAX-1), reusing sha3's sponge/padding, so
+# every hash is byte-identical — the verify result is unchanged, only faster.
+
+set -euo pipefail  # exit on errors, unset variables, and failing pipeline stages
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute dir of this script
+ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"  # parent of the script dir
+SHIM_DIR="$ROOT_DIR/keccak-precompile"  # crate patched in for the "precompile" run
+
+: "${GUEST_DIR:?set GUEST_DIR to your recursion-guest crate root}"  # aborts with this message if unset or empty
+: "${VERIFY_BENCH_CMD:?set VERIFY_BENCH_CMD to the command that runs your guest verify bench}"  # likewise required
+CYCLE_GREP="${CYCLE_GREP:-[0-9][0-9.,]*[ ]*(cycles|B|M|instructions)}"  # optional; this is the default
+
+GUEST_DIR="$(cd "$GUEST_DIR" && pwd)"  # normalize to an absolute path
+MANIFEST="$GUEST_DIR/Cargo.toml"
+[ -f "$MANIFEST" ] || { echo "no Cargo.toml in $GUEST_DIR" >&2; exit 1; }
+[ -f "$SHIM_DIR/Cargo.toml" ] || { echo "shim not found at $SHIM_DIR" >&2; exit 1; }
+
+BACKUP="$(mktemp)"  # scratch copy of the guest manifest as found at startup
+cp "$MANIFEST" "$BACKUP"
+restore() { cp "$BACKUP" "$MANIFEST"; rm -f "$BACKUP"; }  # put the manifest back, drop the copy
+trap restore EXIT  # undo any manifest edits whenever the script exits
+
+GREEN='\033[0;32m'; BOLD='\033[1m'; NC='\033[0m'  # ANSI escapes, interpreted by `echo -e`
+
+run_bench() {  # run the bench in $GUEST_DIR; tee its stdout to /tmp/verify_bench_<label>.out
+    local label="$1"  # used in the banner and in the output file name
+    echo -e "\n${BOLD}=== verify bench: $label ===${NC}"
+    ( cd "$GUEST_DIR" && eval "$VERIFY_BENCH_CMD" ) | tee "/tmp/verify_bench_${label}.out"
+}
+
+add_patch() {
+    # Add `keccak = { path = SHIM }` under [patch.crates-io], creating the
+    # section if absent (a second [patch.crates-io] table is a cargo error).
+    # NOTE: an already-present `keccak` key in that table is not detected.
+    local line="keccak = { path = \"$SHIM_DIR\" }"
+    if grep -qE '^\[patch\.crates-io\]' "$MANIFEST"; then
+        # insert right after the first matching section header (`done` guards repeats)
+        awk -v l="$line" '
+            { print }
+            /^\[patch\.crates-io\]/ && !done { print l; done=1 }
+        ' "$MANIFEST" > "$MANIFEST.tmp" && mv "$MANIFEST.tmp" "$MANIFEST"
+    else
+        printf '\n[patch.crates-io]\n%s\n' "$line" >> "$MANIFEST"
+    fi
+}
+
+extract() { grep -oiE "$CYCLE_GREP" "$1" | head -1 || true; }  # first match in file $1; empty if none
+
+# 1) Baseline (software Keccak)
+# The manifest is still untouched here: $BACKUP was copied from it above and
+# nothing has edited it yet, so no restore/re-backup round trip is needed.
+run_bench "baseline"
+
+# 2) With precompile shim
+add_patch
+echo -e "${GREEN}[patched] added: keccak = { path = $SHIM_DIR }${NC}"
+run_bench "precompile"
+
+# The EXIT trap puts the guest manifest back; nothing to undo by hand here.
+
+echo ""
+echo -e "${BOLD}=== Summary ===${NC}"
+echo "  baseline (software Keccak) : $(extract /tmp/verify_bench_baseline.out)"
+echo "  with precompile shim       : $(extract /tmp/verify_bench_precompile.out)"
+echo ""
+echo "Full outputs: /tmp/verify_bench_baseline.out  /tmp/verify_bench_precompile.out"
+echo "Guest Cargo.toml restored."