From afae49b6587e7a9cf6daec080658eb256dc0c833 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 15:18:31 -0300 Subject: [PATCH 1/8] perf(verifier): batch all DEEP denominators into one inversion --- crypto/stark/src/verifier.rs | 157 ++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 57 deletions(-) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 8091c8b32..73a2df162 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -562,68 +562,122 @@ pub trait IsStarkVerifier< let primitive_root = &Field::get_primitive_root_of_unity(domain.root_order as u64) .expect("verifier domain root_order is a valid power of two"); + let height = proof.trace_ood_evaluations.height; + // Per-entry stride in the flat denominator buffer: `height` trace + // denominators followed by one composition denominator. + let stride = height + 1; + + // Per-entry data carried from Pass 1 to Pass 2. We own `lde_base` + // (concatenated precomputed + main columns) and borrow the aux and + // composition evaluation slices straight out of the proof opening. + struct DeepEntry<'a, Field: IsField, FieldExtension: IsField> { + lde_base: Vec>, + lde_aux: &'a [FieldElement], + comp_evals: &'a [FieldElement], + is_sym: bool, + } + + let mut entries: Vec> = + Vec::with_capacity(num_queries * 2); + // Flat buffer of all denominators across every (query, query-point). + // A SINGLE batch inversion is performed over this whole buffer. + let mut all_denoms: Vec> = + Vec::with_capacity(num_queries * 2 * stride); + + // Pass 1: collect openings + denominators (no inversions yet). for (i, iota) in challenges.iotas.iter().enumerate() { let opening = &proof.deep_poly_openings[i]; - // Base-field portion: precomputed columns FIRST, then main trace columns. 
- let mut lde_base: Vec> = Vec::new(); - if let Some(p) = &opening.precomputed_trace_polys { - lde_base.extend_from_slice(&p.evaluations); - } - lde_base.extend_from_slice(&opening.main_trace_polys.evaluations); + for is_sym in [false, true] { + // Base-field portion: precomputed columns FIRST, then main trace columns. + let mut lde_base: Vec> = Vec::new(); + if let Some(p) = &opening.precomputed_trace_polys { + if is_sym { + lde_base.extend_from_slice(&p.evaluations_sym); + } else { + lde_base.extend_from_slice(&p.evaluations); + } + } + if is_sym { + lde_base.extend_from_slice(&opening.main_trace_polys.evaluations_sym); + } else { + lde_base.extend_from_slice(&opening.main_trace_polys.evaluations); + } - let lde_aux: &[FieldElement] = opening - .aux_trace_polys - .as_ref() - .map(|a| a.evaluations.as_slice()) - .unwrap_or(&[]); + let lde_aux: &[FieldElement] = match &opening.aux_trace_polys { + Some(a) if is_sym => a.evaluations_sym.as_slice(), + Some(a) => a.evaluations.as_slice(), + None => &[], + }; - let evaluation_point = Self::query_challenge_to_evaluation_point(*iota, false, domain); - deep_poly_evaluations.push(Self::reconstruct_deep_composition_poly_evaluation( - proof, - &evaluation_point, - primitive_root, - challenges, - &lde_base, - lde_aux, - &opening.composition_poly.evaluations, - )?); - - // Mirror for the symmetric query point. - let mut lde_base_sym: Vec> = Vec::new(); - if let Some(p) = &opening.precomputed_trace_polys { - lde_base_sym.extend_from_slice(&p.evaluations_sym); + let comp_evals: &[FieldElement] = if is_sym { + &opening.composition_poly.evaluations_sym + } else { + &opening.composition_poly.evaluations + }; + + let evaluation_point = + Self::query_challenge_to_evaluation_point(*iota, is_sym, domain); + + // `height` trace denominators: (upsilon - z*g^k) for k = 0..height. 
+ let mut current_z = challenges.z.clone(); + for _ in 0..height { + all_denoms.push(&evaluation_point - &current_z); + current_z = primitive_root * &current_z; + } + // One composition denominator: (upsilon - z^N). + let z_pow = challenges.z.pow(comp_evals.len()); + all_denoms.push(&evaluation_point - &z_pow); + + entries.push(DeepEntry { + lde_base, + lde_aux, + comp_evals, + is_sym, + }); } - lde_base_sym.extend_from_slice(&opening.main_trace_polys.evaluations_sym); + } + + // Single global batch inversion. A malformed proof can land an OOD + // evaluation point on the LDE coset (zero denominator); this rejects + // the whole verify, matching the prior per-call semantics. + FieldElement::inplace_batch_inverse(&mut all_denoms).ok()?; - let lde_aux_sym: &[FieldElement] = opening - .aux_trace_polys - .as_ref() - .map(|a| a.evaluations_sym.as_slice()) - .unwrap_or(&[]); + // Pass 2: reconstruct each DEEP evaluation using the pre-inverted denoms. + for (e, entry) in entries.iter().enumerate() { + let denoms_slice = &all_denoms[e * stride..e * stride + stride]; + let trace_denoms_inv = &denoms_slice[..height]; + let comp_denom_inv = &denoms_slice[height]; - let evaluation_point = Self::query_challenge_to_evaluation_point(*iota, true, domain); - deep_poly_evaluations_sym.push(Self::reconstruct_deep_composition_poly_evaluation( + let value = Self::reconstruct_deep_composition_poly_evaluation( proof, - &evaluation_point, - primitive_root, challenges, - &lde_base_sym, - lde_aux_sym, - &opening.composition_poly.evaluations_sym, - )?); + &entry.lde_base, + entry.lde_aux, + entry.comp_evals, + trace_denoms_inv, + comp_denom_inv, + )?; + + if entry.is_sym { + deep_poly_evaluations_sym.push(value); + } else { + deep_poly_evaluations.push(value); + } } + Some((deep_poly_evaluations, deep_poly_evaluations_sym)) } + #[allow(clippy::too_many_arguments)] fn reconstruct_deep_composition_poly_evaluation( proof: &StarkProof, - evaluation_point: &FieldElement, - primitive_root: &FieldElement, 
challenges: &Challenges, lde_trace_base_evaluations: &[FieldElement], lde_trace_aux_evaluations: &[FieldElement], lde_composition_poly_parts_evaluation: &[FieldElement], + trace_denoms_inv: &[FieldElement], + comp_denom_inv: &FieldElement, ) -> Option> { let ood_evaluations_table_height = proof.trace_ood_evaluations.height; let ood_evaluations_table_width = proof.trace_ood_evaluations.width; @@ -644,15 +698,9 @@ pub trait IsStarkVerifier< { return None; } - - let mut denoms_trace = Vec::with_capacity(ood_evaluations_table_height); - let mut current_z = challenges.z.clone(); - for _ in 0..ood_evaluations_table_height { - denoms_trace.push(evaluation_point - &current_z); - current_z = primitive_root * &current_z; + if trace_denoms_inv.len() != ood_evaluations_table_height { + return None; } - // A malformed proof can land an OOD evaluation point on the LDE coset, reject. - FieldElement::inplace_batch_inverse(&mut denoms_trace).ok()?; let num_base = lde_trace_base_evaluations.len(); let trace_term = (0..ood_evaluations_table_width) @@ -668,18 +716,13 @@ pub trait IsStarkVerifier< } else { &lde_trace_aux_evaluations[col_idx - num_base] - ood_val }; - let poly_evaluation = diff * &denoms_trace[row_idx]; + let poly_evaluation = diff * &trace_denoms_inv[row_idx]; trace_t + &poly_evaluation * coeff }, ); trace_terms + trace_i }); - let number_of_parts = lde_composition_poly_parts_evaluation.len(); - let z_pow = &challenges.z.pow(number_of_parts); - - // A malformed proof can make evaluation_point == z^N, reject. 
- let denom_composition = (evaluation_point - z_pow).inv().ok()?; let mut h_terms = FieldElement::zero(); for (j, h_i_upsilon) in lde_composition_poly_parts_evaluation.iter().enumerate() { // Bounds-check via `.get(j)?`: a malformed opening may have more @@ -689,7 +732,7 @@ pub trait IsStarkVerifier< let h_i_term = (h_i_upsilon - h_i_zpower) * gamma; h_terms += h_i_term; } - h_terms *= denom_composition; + h_terms *= comp_denom_inv; Some(trace_term + h_terms) } From 618f97f5ece4441ba0c08fd2ab7a9eb8bb536aa3 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 15:54:36 -0300 Subject: [PATCH 2/8] perf(verifier): drop per-query lde_base concatenation; hoist z_pow --- crypto/stark/src/verifier.rs | 66 +++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 73a2df162..b63823e9b 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -567,11 +567,19 @@ pub trait IsStarkVerifier< // denominators followed by one composition denominator. let stride = height + 1; - // Per-entry data carried from Pass 1 to Pass 2. We own `lde_base` - // (concatenated precomputed + main columns) and borrow the aux and - // composition evaluation slices straight out of the proof opening. + // The composition denominator exponent is constant across all queries: + // it is the number of composition poly parts the proof advertises (the + // same array the consumer validates against). Hoist `z^N` once. + let number_of_parts = proof.composition_poly_parts_ood_evaluation.len(); + let z_pow = challenges.z.pow(number_of_parts); + + // Per-entry data carried from Pass 1 to Pass 2. We borrow every slice + // straight out of the proof opening: precomputed and main base-field + // columns separately (avoiding a per-query concatenation allocation), + // plus the aux and composition evaluation slices. 
struct DeepEntry<'a, Field: IsField, FieldExtension: IsField> { - lde_base: Vec>, + lde_precomputed: &'a [FieldElement], + lde_main: &'a [FieldElement], lde_aux: &'a [FieldElement], comp_evals: &'a [FieldElement], is_sym: bool, @@ -589,20 +597,20 @@ pub trait IsStarkVerifier< let opening = &proof.deep_poly_openings[i]; for is_sym in [false, true] { - // Base-field portion: precomputed columns FIRST, then main trace columns. - let mut lde_base: Vec> = Vec::new(); - if let Some(p) = &opening.precomputed_trace_polys { - if is_sym { - lde_base.extend_from_slice(&p.evaluations_sym); - } else { - lde_base.extend_from_slice(&p.evaluations); - } - } - if is_sym { - lde_base.extend_from_slice(&opening.main_trace_polys.evaluations_sym); + // Base-field portion: precomputed columns FIRST, then main trace + // columns. Borrow both slices directly (empty slice when the + // opening carries no precomputed trace). + let lde_precomputed: &[FieldElement] = match &opening.precomputed_trace_polys + { + Some(p) if is_sym => p.evaluations_sym.as_slice(), + Some(p) => p.evaluations.as_slice(), + None => &[], + }; + let lde_main: &[FieldElement] = if is_sym { + opening.main_trace_polys.evaluations_sym.as_slice() } else { - lde_base.extend_from_slice(&opening.main_trace_polys.evaluations); - } + opening.main_trace_polys.evaluations.as_slice() + }; let lde_aux: &[FieldElement] = match &opening.aux_trace_polys { Some(a) if is_sym => a.evaluations_sym.as_slice(), @@ -626,11 +634,11 @@ pub trait IsStarkVerifier< current_z = primitive_root * &current_z; } // One composition denominator: (upsilon - z^N). 
- let z_pow = challenges.z.pow(comp_evals.len()); all_denoms.push(&evaluation_point - &z_pow); entries.push(DeepEntry { - lde_base, + lde_precomputed, + lde_main, lde_aux, comp_evals, is_sym, @@ -652,7 +660,8 @@ pub trait IsStarkVerifier< let value = Self::reconstruct_deep_composition_poly_evaluation( proof, challenges, - &entry.lde_base, + entry.lde_precomputed, + entry.lde_main, entry.lde_aux, entry.comp_evals, trace_denoms_inv, @@ -673,7 +682,8 @@ pub trait IsStarkVerifier< fn reconstruct_deep_composition_poly_evaluation( proof: &StarkProof, challenges: &Challenges, - lde_trace_base_evaluations: &[FieldElement], + lde_precomputed: &[FieldElement], + lde_main: &[FieldElement], lde_trace_aux_evaluations: &[FieldElement], lde_composition_poly_parts_evaluation: &[FieldElement], trace_denoms_inv: &[FieldElement], @@ -687,9 +697,9 @@ pub trait IsStarkVerifier< // column count does not match the OOD table width, or whose composition // poly parts count does not match the proof's `composition_poly_parts_ood_evaluation`. // Without these checks the indexing below would panic in release builds. - if lde_trace_base_evaluations.len() + lde_trace_aux_evaluations.len() - != ood_evaluations_table_width - { + let num_precomp = lde_precomputed.len(); + let num_base = num_precomp + lde_main.len(); + if num_base + lde_trace_aux_evaluations.len() != ood_evaluations_table_width { return None; } if trace_term_coeffs.is_empty() @@ -702,7 +712,6 @@ pub trait IsStarkVerifier< return None; } - let num_base = lde_trace_base_evaluations.len(); let trace_term = (0..ood_evaluations_table_width) .zip(&challenges.trace_term_coeffs) .fold(FieldElement::zero(), |trace_terms, (col_idx, coeff_row)| { @@ -711,8 +720,11 @@ pub trait IsStarkVerifier< |trace_t, (row_idx, coeff)| { let ood_val = &proof.trace_ood_evaluations.get_row(row_idx)[col_idx]; // Stay in base when we can: F: IsSubFieldOf gives F - E -> E. 
- let diff: FieldElement = if col_idx < num_base { - &lde_trace_base_evaluations[col_idx] - ood_val + // Base columns are precomputed first, then main, then aux. + let diff: FieldElement = if col_idx < num_precomp { + &lde_precomputed[col_idx] - ood_val + } else if col_idx < num_base { + &lde_main[col_idx - num_precomp] - ood_val } else { &lde_trace_aux_evaluations[col_idx - num_base] - ood_val }; From 1958280f2bf8c356ff1f3bec55b614c35364fe77 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 15:59:59 -0300 Subject: [PATCH 3/8] perf(verifier): compute LogUp alpha powers once, slice per table --- crypto/stark/src/verifier.rs | 41 ++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index b63823e9b..bbe1efb20 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -102,6 +102,7 @@ pub trait IsStarkVerifier< proof: &StarkProof, domain: &VerifierDomain, challenges: &Challenges, + logup_alpha_powers: &[FieldElement], ) -> bool { let trace_length = proof.trace_length; let boundary_constraints = air.boundary_constraints( @@ -173,14 +174,15 @@ pub trait IsStarkVerifier< let num_main_trace_columns = proof.trace_ood_evaluations.width - air.num_auxiliary_rap_columns(); - let logup_alpha_powers: Vec> = - if challenges.rap_challenges.len() > LOGUP_CHALLENGE_ALPHA { - compute_alpha_powers( - &challenges.rap_challenges[LOGUP_CHALLENGE_ALPHA], - air.max_bus_elements(), - ) + // Reuse a prefix slice of the globally-computed alpha powers instead of + // recomputing the multiplication chain per table. The global vector is + // sized to the maximum bus element count across all AIRs, so this + // table's prefix is always available; `.min` is purely defensive. 
+ let logup_alpha_powers_slice: &[FieldElement] = + if !logup_alpha_powers.is_empty() { + &logup_alpha_powers[..air.max_bus_elements().min(logup_alpha_powers.len())] } else { - Vec::new() + &[] }; let logup_table_offset = match &proof.bus_public_inputs { @@ -201,7 +203,7 @@ pub trait IsStarkVerifier< &ood_frame, &periodic_values, &challenges.rap_challenges, - &logup_alpha_powers, + logup_alpha_powers_slice, &logup_table_offset, &packing_shifts, ); @@ -842,6 +844,19 @@ pub trait IsStarkVerifier< Vec::new() }; + // Compute the LogUp alpha powers ONCE, up to the global maximum bus + // element count across all AIRs. `compute_alpha_powers` returns the + // strict prefix sequence `[1, α, α², …]`, and the alpha challenge is + // shared (identical) across all tables, so each table can reuse a + // prefix slice of this global vector instead of recomputing the chain. + let logup_alpha_powers_global: Vec> = + if lookup_challenges.len() > LOGUP_CHALLENGE_ALPHA { + let global_max_bus = airs.iter().map(|a| a.max_bus_elements()).max().unwrap_or(0); + compute_alpha_powers(&lookup_challenges[LOGUP_CHALLENGE_ALPHA], global_max_bus) + } else { + Vec::new() + }; + // ===================================================================== // Validate bus_public_inputs presence against AIR layout // ===================================================================== @@ -897,6 +912,7 @@ pub trait IsStarkVerifier< proof, &mut table_transcript, lookup_challenges.clone(), + &logup_alpha_powers_global, ) { error!( "Table {} failed verify_rounds_2_to_4 (num_constraints={}, trace_cols={})", @@ -1103,6 +1119,7 @@ pub trait IsStarkVerifier< proof: &StarkProof, transcript: &mut impl IsStarkTranscript, rap_challenges: Vec>, + logup_alpha_powers: &[FieldElement], ) -> bool where FieldElement: AsBytes + Sync + Send, @@ -1147,7 +1164,13 @@ pub trait IsStarkVerifier< #[cfg(feature = "instruments")] let timer2 = Instant::now(); - if !Self::step_2_verify_claimed_composition_polynomial(air, proof, 
&domain, &challenges) { + if !Self::step_2_verify_claimed_composition_polynomial( + air, + proof, + &domain, + &challenges, + logup_alpha_powers, + ) { #[cfg(not(feature = "test_fiat_shamir"))] error!("Composition Polynomial verification failed"); return false; From 2a4cb7580c2769ed9be8b1efc45089521be80094 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 16:05:27 -0300 Subject: [PATCH 4/8] refactor(verifier): consolidate common pre-fork section in multi_verify; borrow lookup challenges --- crypto/stark/src/verifier.rs | 37 +++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index bbe1efb20..6f0ba6c62 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -792,6 +792,17 @@ pub trait IsStarkVerifier< // Check if any AIR has an auxiliary trace let needs_lookup_challenges = airs.iter().any(|air| air.has_aux_trace()); + // ##################################################################### + // ##### COMMON (shared, pre-fork) ##################################### + // ##################################################################### + // Everything below is computed ONCE on the shared transcript before any + // per-table fork: main commitments are appended, the shared LogUp + // challenges are sampled, the global alpha powers are derived, and the + // bus_public_inputs layout is validated. Only after this section do we + // fork the transcript per table. The exact sequence of transcript + // operations here is soundness-critical (Fiat-Shamir) and must match + // the prover byte-for-byte. 
+ // ===================================================================== // Round 1, Phase A: Replay main trace commitments // ===================================================================== @@ -880,9 +891,12 @@ pub trait IsStarkVerifier< } } - // ===================================================================== - // Phase C + Rounds 2-4: Forked per table - // ===================================================================== + // ##################################################################### + // ##### PER-TABLE (forked transcript) ################################# + // ##################################################################### + // The shared/common section is finished. From here each table branches. + // + // Phase C + Rounds 2-4: Forked per table. // Each table gets an independent transcript fork (cloned from the shared // state after Phase B, domain-separated by table index). This matches // the prover's forking and makes per-table verification independent. @@ -911,7 +925,7 @@ pub trait IsStarkVerifier< *air, proof, &mut table_transcript, - lookup_challenges.clone(), + &lookup_challenges, &logup_alpha_powers_global, ) { error!( @@ -1118,7 +1132,7 @@ pub trait IsStarkVerifier< air: &dyn AIR, proof: &StarkProof, transcript: &mut impl IsStarkTranscript, - rap_challenges: Vec>, + rap_challenges: &[FieldElement], logup_alpha_powers: &[FieldElement], ) -> bool where @@ -1137,8 +1151,17 @@ pub trait IsStarkVerifier< #[cfg(feature = "instruments")] let timer1 = Instant::now(); - let challenges = - Self::replay_rounds_after_round_1(air, proof, &domain, transcript, rap_challenges); + // `replay_rounds_after_round_1` takes ownership of `rap_challenges` + // (it is stored owned in the returned `Challenges`). Clone exactly once + // here, where ownership is actually required — this removes the + // per-table clone that previously lived at the `multi_verify` call site. 
+ let challenges = Self::replay_rounds_after_round_1( + air, + proof, + &domain, + transcript, + rap_challenges.to_vec(), + ); // verify grinding let security_bits = air.context().proof_options.grinding_factor; From 8f93fd83da0f42b50511155cbc5fe08d3a2b589e Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 16:31:29 -0300 Subject: [PATCH 5/8] perf(verifier): hash Merkle leaves from borrowed slices (no per-opening Vec alloc) --- .../backends/field_element_vector.rs | 24 ++++++++++++++ crypto/crypto/src/merkle_tree/proof.rs | 22 +++++++++---- .../src/tests/field_element_vector_tests.rs | 33 +++++++++++++++++++ crypto/stark/src/verifier.rs | 27 +++++++++------ 4 files changed, 90 insertions(+), 16 deletions(-) diff --git a/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs b/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs index 25ba807c6..cc392843f 100644 --- a/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs +++ b/crypto/crypto/src/merkle_tree/backends/field_element_vector.rs @@ -88,6 +88,30 @@ where } } +impl FieldElementVectorBackend +where + F: IsField, + FieldElement: AsBytes, + D: Digest, + [u8; NUM_BYTES]: From>, +{ + /// Hash a sequence of borrowed field elements into a leaf node, identical + /// to `hash_data` over the same sequence — but without materializing a Vec. 
+ pub fn hash_elements<'a, I>(elements: I) -> [u8; NUM_BYTES] + where + I: IntoIterator>, + F: 'a, + { + let mut hasher = D::new(); + for element in elements { + hasher.update(element.as_bytes()); + } + let mut result_hash = [0u8; NUM_BYTES]; + result_hash.copy_from_slice(&hasher.finalize()); + result_hash + } +} + impl IsMerkleTreeBackend for FieldElementVectorBackend where diff --git a/crypto/crypto/src/merkle_tree/proof.rs b/crypto/crypto/src/merkle_tree/proof.rs index 20d5452a2..a502d0312 100644 --- a/crypto/crypto/src/merkle_tree/proof.rs +++ b/crypto/crypto/src/merkle_tree/proof.rs @@ -20,25 +20,35 @@ pub struct Proof { } impl Proof { - /// Verifies a Merkle inclusion proof for the value contained at leaf index. - pub fn verify(&self, root_hash: &B::Node, mut index: usize, value: &B::Data) -> bool + /// Verify inclusion when the caller already computed the leaf hash + /// (lets callers hash borrowed leaf data without materializing `B::Data`). + pub fn verify_hashed( + &self, + root_hash: &B::Node, + mut index: usize, + mut hashed_value: B::Node, + ) -> bool where B: IsMerkleTreeBackend, { - let mut hashed_value = B::hash_data(value); - for sibling_node in self.merkle_path.iter() { if index.is_multiple_of(2) { hashed_value = B::hash_new_parent(&hashed_value, sibling_node); } else { hashed_value = B::hash_new_parent(sibling_node, &hashed_value); } - index >>= 1; } - root_hash == &hashed_value } + + /// Verifies a Merkle inclusion proof for the value contained at leaf index. 
+ pub fn verify(&self, root_hash: &B::Node, index: usize, value: &B::Data) -> bool + where + B: IsMerkleTreeBackend, + { + self.verify_hashed::(root_hash, index, B::hash_data(value)) + } } #[cfg(feature = "alloc")] diff --git a/crypto/crypto/src/tests/field_element_vector_tests.rs b/crypto/crypto/src/tests/field_element_vector_tests.rs index 145e3f463..75b946472 100644 --- a/crypto/crypto/src/tests/field_element_vector_tests.rs +++ b/crypto/crypto/src/tests/field_element_vector_tests.rs @@ -6,6 +6,7 @@ use sha3::{Keccak256, Keccak512, Sha3_256, Sha3_512}; use crate::merkle_tree::{ backends::field_element_vector::FieldElementVectorBackend, merkle::MerkleTree, + traits::IsMerkleTreeBackend, }; type F = GoldilocksField; @@ -120,3 +121,35 @@ fn hash_data_field_element_backend_works_with_sha2_512() { &values[0] )); } + +#[test] +fn hash_elements_matches_hash_data_byte_for_byte() { + type Backend = FieldElementVectorBackend; + + // Pseudo-random Vec generated from a simple LCG so the test is deterministic + // yet exercises a non-trivial sequence of field elements. + let mut state: u64 = 0x9E3779B97F4A7C15; + let v: Vec = (0..37) + .map(|_| { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + FE::from(state) + }) + .collect(); + + let via_hash_data = Backend::hash_data(&v); + let via_hash_elements = Backend::hash_elements(v.iter()); + + assert_eq!( + via_hash_data, via_hash_elements, + "hash_elements must be byte-identical to hash_data over the same sequence" + ); + + // Empty sequence must also agree. 
+ let empty: Vec = Vec::new(); + assert_eq!( + Backend::hash_data(&empty), + Backend::hash_elements(empty.iter()) + ); +} diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 6f0ba6c62..8ed86c891 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -316,7 +316,8 @@ pub trait IsStarkVerifier< E: IsField, Field: IsSubFieldOf, { - proof.verify::>(root, index, &value.to_owned()) + let leaf = BatchedMerkleTreeBackend::::hash_elements(value.iter()); + proof.verify_hashed::>(root, index, leaf) } /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings @@ -398,16 +399,21 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, { - let mut value = deep_poly_openings.composition_poly.evaluations.clone(); - value.extend_from_slice(&deep_poly_openings.composition_poly.evaluations_sym); + let leaf = BatchedMerkleTreeBackend::::hash_elements( + deep_poly_openings + .composition_poly + .evaluations + .iter() + .chain(deep_poly_openings.composition_poly.evaluations_sym.iter()), + ); deep_poly_openings .composition_poly .proof - .verify::>( + .verify_hashed::>( composition_poly_merkle_root, *iota, - &value, + leaf, ) } @@ -447,16 +453,17 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, { - let evaluations = if iota % 2 == 1 { - vec![evaluation_sym.clone(), evaluation.clone()] + let (a, b) = if iota % 2 == 1 { + (evaluation_sym, evaluation) } else { - vec![evaluation.clone(), evaluation_sym.clone()] + (evaluation, evaluation_sym) }; + let leaf = BatchedMerkleTreeBackend::::hash_elements([a, b]); - auth_path_sym.verify::>( + auth_path_sym.verify_hashed::>( merkle_root, iota >> 1, - &evaluations, + leaf, ) } From d4de863067b077aa4bd8c7c357f5952b1e6952d0 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 16:43:27 -0300 Subject: [PATCH 6/8] perf(verifier): fold in #626 micro-opts (Vec step-memo, 
slice multi_verify, opening short-circuit) --- crypto/stark/src/verifier.rs | 195 ++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 81 deletions(-) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 8ed86c891..5132718d5 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -25,7 +25,6 @@ use math::{ }, traits::AsBytes, }; -use std::collections::HashMap; use std::marker::PhantomData; #[cfg(feature = "instruments")] use std::time::Instant; @@ -111,18 +110,23 @@ pub trait IsStarkVerifier< proof.bus_public_inputs.as_ref(), trace_length, ); - // Precompute g^step once per distinct step to avoid the prior O(B^2) - // linear scan. A single pass populates a memo and resolves each - // constraint's step to its point in O(1) amortized. - let mut step_to_point: HashMap> = HashMap::new(); + // Precompute g^step once per distinct step. A small `Vec` with a + // linear scan beats `HashMap` here: boundary constraints typically + // number in the low tens, the recursion guest pays no allocator/hash + // overhead, and the AIR generally emits its constraints grouped by + // step so the scan hits the hot entry first. + let mut step_to_point: Vec<(usize, FieldElement)> = Vec::new(); let boundary_points: Vec> = boundary_constraints .constraints .iter() .map(|c| { - step_to_point - .entry(c.step) - .or_insert_with(|| domain.trace_primitive_root.pow(c.step as u64)) - .clone() + if let Some((_, point)) = step_to_point.iter().find(|(s, _)| *s == c.step) { + point.clone() + } else { + let point = domain.trace_primitive_root.pow(c.step as u64); + step_to_point.push((c.step, point.clone())); + point + } }) .collect(); @@ -355,37 +359,48 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, { // Main trace (multiplicities for preprocessed, full trace for normal). 
- let mut ok = Self::verify_opening_pair::( + // Short-circuit on any failure: each opening pair is a Merkle-path + // verification (~20 Keccak hashes against base-field leaves); in the + // recursion guest this is non-trivial cycle cost worth skipping. + if !Self::verify_opening_pair::( &deep_poly_openings.main_trace_polys, &proof.lde_trace_main_merkle_root, iota, - ); + ) { + return false; + } // Precomputed trace (preprocessed tables only). Mismatched presence is // unreachable in practice (multi_verify rejects such proofs upstream), // but a defensive check keeps this function self-contained. - ok &= match ( + match ( &proof.lde_trace_precomputed_merkle_root, &deep_poly_openings.precomputed_trace_polys, ) { - (Some(root), Some(opening)) => Self::verify_opening_pair::(opening, root, iota), - (None, None) => true, - _ => false, - }; + (Some(root), Some(opening)) => { + if !Self::verify_opening_pair::(opening, root, iota) { + return false; + } + } + (None, None) => {} + _ => return false, + } // Auxiliary trace. 
- ok &= match ( + match ( proof.lde_trace_aux_merkle_root, &deep_poly_openings.aux_trace_polys, ) { (Some(root), Some(opening)) => { - Self::verify_opening_pair::(opening, &root, iota) + if !Self::verify_opening_pair::(opening, &root, iota) { + return false; + } } - (None, None) => true, - _ => false, - }; + (None, None) => {} + _ => return false, + } - ok + true } /// Verify opening Open(Hᵢ(D_LDE), 𝜐) and Open(Hᵢ(D_LDE), -𝜐) for all parts Hᵢof the composition @@ -489,19 +504,12 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, { let fri_layers_merkle_roots = &proof.fri_layers_merkle_roots; - let evaluation_point_vec: Vec> = - core::iter::successors(Some(evaluation_point_inv.square()), |evaluation_point| { - Some(evaluation_point.square()) - }) - .take(fri_layers_merkle_roots.len()) - .collect(); - let p0_eval = deep_composition_evaluation; let p0_eval_sym = deep_composition_evaluation_sym; // Reconstruct p₁(𝜐²) let mut v = - (p0_eval + p0_eval_sym) + evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym); + (p0_eval + p0_eval_sym) + &evaluation_point_inv * &zetas[0] * (p0_eval - p0_eval_sym); let mut index = iota; // Handle case with 0 FRI layers (trace_length <= 2) @@ -511,49 +519,57 @@ pub trait IsStarkVerifier< return v == proof.fri_last_value; } - // For each FRI layer, starting from the layer 1: use the proof to verify the validity of values pᵢ(−𝜐^(2ⁱ)) (given by the prover) and - // pᵢ(𝜐^(2ⁱ)) (computed on the previous iteration by the verifier). Then use them to obtain pᵢ₊₁(𝜐^(2ⁱ⁺¹)). - // Finally, check that the final value coincides with the given by the prover. - fri_layers_merkle_roots + // Guard zip alignment: the three iterables MUST have equal lengths. + // A malformed proof with mismatched lengths would otherwise silently + // truncate the verification or panic on the `len() - 1` below. 
+ if fri_decommitment.layers_auth_paths.len() != fri_layers_merkle_roots.len() + || fri_decommitment.layers_evaluations_sym.len() != fri_layers_merkle_roots.len() + { + return false; + } + + // For each FRI layer, verify openings then fold to the next layer's + // evaluation. `evaluation_point_squared` is stepped in-place instead + // of pre-collecting a Vec, and a failed opening short-circuits the + // remaining Merkle work (each call is ~log₂(N) Keccak hashes). + let last_layer_idx = fri_layers_merkle_roots.len() - 1; + let mut evaluation_point_squared = evaluation_point_inv.square(); + for (i, ((merkle_root, auth_path_sym), evaluation_sym)) in fri_layers_merkle_roots .iter() - .enumerate() .zip(&fri_decommitment.layers_auth_paths) .zip(&fri_decommitment.layers_evaluations_sym) - .zip(evaluation_point_vec) - .fold( - true, - |result, - ( - (((i, merkle_root), auth_path_sym), evaluation_sym), - evaluation_point_inv, - )| { - // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)). - // `v` is pᵢ(𝜐^(2ⁱ)). - // `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)). - let openings_ok = Self::verify_fri_layer_openings( - merkle_root, - auth_path_sym, - &v, - evaluation_sym, - index, - ); - - // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)). - v = (&v + evaluation_sym) + evaluation_point_inv * &zetas[i + 1] * (&v - evaluation_sym); - - // Update index for next iteration. The index of the squares in the next layer - // is obtained by halving the current index. This is due to the bit-reverse - // ordering of the elements in the Merkle tree. - index >>= 1; - - if i < fri_decommitment.layers_evaluations_sym.len() - 1 { - result & openings_ok - } else { - // Check that final value is the given by the prover - result & (v == proof.fri_last_value) & openings_ok - } - }, - ) + .enumerate() + { + // Verify opening Open(pᵢ(Dₖ), −𝜐^(2ⁱ)) and Open(pᵢ(Dₖ), 𝜐^(2ⁱ)). + // `v` is pᵢ(𝜐^(2ⁱ)). `evaluation_sym` is pᵢ(−𝜐^(2ⁱ)). 
+ if !Self::verify_fri_layer_openings( + merkle_root, + auth_path_sym, + &v, + evaluation_sym, + index, + ) { + return false; + } + + // Update `v` with next value pᵢ₊₁(𝜐^(2ⁱ⁺¹)). + v = (&v + evaluation_sym) + + &evaluation_point_squared * &zetas[i + 1] * (&v - evaluation_sym); + + // Index of the squares in the next layer = current index halved + // (bit-reverse ordering of the Merkle tree). + index >>= 1; + + if i == last_layer_idx { + return v == proof.fri_last_value; + } + evaluation_point_squared = evaluation_point_squared.square(); + } + + // Unreachable: the length guard above ensures the loop iterates at + // least once (we passed the is_empty check) and hits the + // `i == last_layer_idx` return. + unreachable!("loop must hit the last_layer_idx return") } fn reconstruct_deep_composition_poly_evaluations_for_all_queries( @@ -787,11 +803,27 @@ pub trait IsStarkVerifier< FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, { - if airs.len() != multi_proof.proofs.len() { + Self::multi_verify_proofs(airs, &multi_proof.proofs, transcript, expected_bus_balance) + } + + /// Slice-taking variant of [`Self::multi_verify`]. Callers that already + /// hold a slice of proofs (or a single proof via [`core::slice::from_ref`]) + /// can call this directly without constructing a [`MultiProof`]. + fn multi_verify_proofs( + airs: &[&dyn AIR], + proofs: &[StarkProof], + transcript: &mut (impl IsStarkTranscript + Clone), + expected_bus_balance: &FieldElement, + ) -> bool + where + FieldElement: AsBytes + Sync + Send, + FieldElement: AsBytes + Sync + Send, + { + if airs.len() != proofs.len() { error!( "AIR count ({}) does not match proof count ({})", airs.len(), - multi_proof.proofs.len() + proofs.len() ); return false; } @@ -816,7 +848,7 @@ pub trait IsStarkVerifier< // For preprocessed tables, use the hardcoded commitment (verifier cannot // trust the prover). For normal tables, use the commitment from the proof. 
- for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() { + for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() { if air.is_preprocessed() { // Preprocessed table: VERIFY precomputed commitment matches hardcoded. // This is the critical soundness check - ensures prover used correct precomputed values. @@ -883,7 +915,7 @@ pub trait IsStarkVerifier< // boundary constraints on LogUp columns, so the bus balance check is // the only cross-table validation. - for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() { + for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() { if air.has_trace_interaction() && proof.bus_public_inputs.is_none() { error!( "Table {idx}: AIR has LogUp interactions but proof is missing bus_public_inputs" @@ -908,7 +940,7 @@ pub trait IsStarkVerifier< // state after Phase B, domain-separated by table index). This matches // the prover's forking and makes per-table verification independent. - for (idx, (air, proof)) in airs.iter().zip(&multi_proof.proofs).enumerate() { + for (idx, (air, proof)) in airs.iter().zip(proofs).enumerate() { // Must match prover: fork with domain separator for multi-table, // use original transcript directly for single-table. 
let num_tables = airs.len(); @@ -957,7 +989,7 @@ pub trait IsStarkVerifier< if needs_lookup_challenges { let mut total = FieldElement::::zero(); - for (air, proof) in airs.iter().zip(&multi_proof.proofs) { + for (air, proof) in airs.iter().zip(proofs) { if air.has_trace_interaction() && let Some(interaction) = &proof.bus_public_inputs { @@ -990,12 +1022,13 @@ pub trait IsStarkVerifier< where FieldElement: AsBytes + Sync + Send, FieldElement: AsBytes + Sync + Send, - PI: Clone, { - let multi_proof = MultiProof { - proofs: vec![proof.clone()], - }; - Self::multi_verify(&[air], &multi_proof, transcript, &FieldElement::zero()) + Self::multi_verify_proofs( + &[air], + core::slice::from_ref(proof), + transcript, + &FieldElement::zero(), + ) } /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has From b85fd4670ea2983eb74800bc3f00bfdc5574799a Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 17:00:55 -0300 Subject: [PATCH 7/8] feat(recursion): keccak crate shim routing f1600/p1600 to the VM precompile --- keccak-precompile/Cargo.lock | 35 ++ keccak-precompile/Cargo.toml | 31 ++ keccak-precompile/LICENSE-APACHE | 201 +++++++++++ keccak-precompile/LICENSE-MIT | 25 ++ keccak-precompile/README.md | 67 ++++ keccak-precompile/src/armv8.rs | 192 ++++++++++ keccak-precompile/src/lib.rs | 601 +++++++++++++++++++++++++++++++ keccak-precompile/src/unroll.rs | 62 ++++ 8 files changed, 1214 insertions(+) create mode 100644 keccak-precompile/Cargo.lock create mode 100644 keccak-precompile/Cargo.toml create mode 100644 keccak-precompile/LICENSE-APACHE create mode 100644 keccak-precompile/LICENSE-MIT create mode 100644 keccak-precompile/README.md create mode 100644 keccak-precompile/src/armv8.rs create mode 100644 keccak-precompile/src/lib.rs create mode 100644 keccak-precompile/src/unroll.rs diff --git a/keccak-precompile/Cargo.lock b/keccak-precompile/Cargo.lock new file mode 100644 index 000000000..357211b01 --- /dev/null +++ 
b/keccak-precompile/Cargo.lock @@ -0,0 +1,35 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "keccak" +version = "0.1.5" +dependencies = [ + "cpufeatures", + "keccak 0.1.6", +] + +[[package]] +name = "keccak" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" diff --git a/keccak-precompile/Cargo.toml b/keccak-precompile/Cargo.toml new file mode 100644 index 000000000..f4ca59294 --- /dev/null +++ b/keccak-precompile/Cargo.toml @@ -0,0 +1,31 @@ +# Empty workspace table so this crate is NOT pulled into the main lambda-vm +# workspace. It is meant to be consumed only via `[patch.crates-io]` in the +# recursion guest. (Mirrors executor/programs/rust/keccak/Cargo.toml.) +[workspace] + +[package] +edition = "2018" +name = "keccak" +version = "0.1.5" +authors = ["RustCrypto Developers"] +description = """ +lambda-vm precompile shim for the `keccak` crate: routes Keccak-f[1600] / +Keccak-p[1600, 24] to the RISC-V Keccak precompile on the riscv64 guest. +Verbatim copy of keccak 0.1.5 except the p1600/f1600 override. Intended as a +`[patch.crates-io]` override in the recursion guest. 
+""" +license = "Apache-2.0 OR MIT" +repository = "https://github.com/lambdaclass/lambda_vm" + +[features] +asm = [] +no_unroll = [] +simd = [] + +[target."cfg(target_arch = \"aarch64\")".dependencies.cpufeatures] +version = "0.2" + +[dev-dependencies] +# Upstream keccak crate under a different name, so the faithfulness test can +# compare against it without triggering a self-patch cycle. +dev-dep-keccak = { package = "keccak", version = "0.1.5" } diff --git a/keccak-precompile/LICENSE-APACHE b/keccak-precompile/LICENSE-APACHE new file mode 100644 index 000000000..78173fa2e --- /dev/null +++ b/keccak-precompile/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/keccak-precompile/LICENSE-MIT b/keccak-precompile/LICENSE-MIT new file mode 100644 index 000000000..81a3d57ac --- /dev/null +++ b/keccak-precompile/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2018-2022 RustCrypto Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/keccak-precompile/README.md b/keccak-precompile/README.md new file mode 100644 index 000000000..d5d3adb32 --- /dev/null +++ b/keccak-precompile/README.md @@ -0,0 +1,67 @@ +# keccak-precompile + +A drop-in shim for the [`keccak`](https://crates.io/crates/keccak) crate +(version 0.1.5) that routes the Keccak-f[1600] / Keccak-p[1600, 24] +permutation to the lambda-vm RISC-V **Keccak precompile** when compiled for +the `riscv64` guest. + +## What it is + +`src/lib.rs` is a **verbatim copy** of upstream `keccak` 0.1.5, with a single +targeted change: the non-asm `p1600` / `f1600` functions delegate the full +24-round permutation to the VM precompile via an `ecall`: + +```rust +#[cfg(target_arch = "riscv64")] +if round_count == 24 { + // ABI: a0 = state ptr, a7 = usize::MAX - 1 + unsafe { core::arch::asm!("ecall", in("a0") state.as_mut_ptr(), in("a7") usize::MAX - 1); } + return; +} +``` + +Every other code path (reduced-round `p1600`, the `LaneSize` trait, `keccak_p`, +the round constants, the `u8`/`u16`/`u32` lane sizes, the `simd` module, and the +aarch64 `asm` variants) is unchanged, so host builds behave exactly like +upstream. + +## Why it works + +The STARK verifier hashes via `sha3::Keccak256`. `sha3` 0.10.x performs its +permutation by calling `keccak::p1600(&mut state, 24)`. By overriding that +single function and patching the `keccak` crate, **all** of `sha3`'s usage — +the Merkle tree backend and the Fiat-Shamir transcript — transparently routes +to the precompile on `riscv64`, while reusing `sha3`'s correct sponge and +padding. + +Correctness is guaranteed: the precompile computes exactly Keccak-f[1600] +(= 24-round `p1600`) in place over the `[u64; 25]` state, so hashes are +byte-identical to the software implementation. 
+ +## How the recursion guest enables it + +Add to the **guest's root `Cargo.toml`**: + +```toml +[patch.crates-io] +keccak = { path = "/keccak-precompile" } +``` + +This replaces the upstream `keccak` dependency pulled in transitively by +`sha3`. On `riscv64` the permutation hits the precompile; on host targets the +build is unchanged. + +## Testing + +The riscv64 precompile path cannot run on the host. The included unit test +(`cargo test`) verifies the copied software permutation is faithful by: + +1. Asserting `f1600` of the all-zero state matches the standard Keccak-f[1600] + test vector, and that `p1600(_, 24)` equals `f1600`. +2. Comparing `crate::f1600` / `crate::p1600` against the upstream `keccak` + crate (pulled in as a renamed dev-dependency `dev-dep-keccak` to avoid a + self-patch cycle) over many pseudo-random states and reduced round counts. + +## License + +Apache-2.0 OR MIT, matching upstream `keccak`. diff --git a/keccak-precompile/src/armv8.rs b/keccak-precompile/src/armv8.rs new file mode 100644 index 000000000..698c8a105 --- /dev/null +++ b/keccak-precompile/src/armv8.rs @@ -0,0 +1,192 @@ +/// Keccak-p1600 on ARMv8.4-A with FEAT_SHA3. +/// +/// See p. K12.2.2 p. 11,749 of the ARM Reference manual. +/// Adapted from the Keccak-f1600 implementation in the XKCP/K12. +/// see +#[target_feature(enable = "sha3")] +pub unsafe fn p1600_armv8_sha3_asm(state: &mut [u64; 25], round_count: usize) { + core::arch::asm!(" + // Read state + ld1.1d {{ v0- v3}}, [x0], #32 + ld1.1d {{ v4- v7}}, [x0], #32 + ld1.1d {{ v8-v11}}, [x0], #32 + ld1.1d {{v12-v15}}, [x0], #32 + ld1.1d {{v16-v19}}, [x0], #32 + ld1.1d {{v20-v23}}, [x0], #32 + ld1.1d {{v24}}, [x0] + sub x0, x0, #192 + + // NOTE: This loop actually computes two f1600 functions in + // parallel, in both the lower and the upper 64-bit of the + // 128-bit registers v0-v24. 
+ 0: sub x8, x8, #1 + + // Theta Calculations + eor3.16b v25, v20, v15, v10 + eor3.16b v26, v21, v16, v11 + eor3.16b v27, v22, v17, v12 + eor3.16b v28, v23, v18, v13 + eor3.16b v29, v24, v19, v14 + eor3.16b v25, v25, v5, v0 + eor3.16b v26, v26, v6, v1 + eor3.16b v27, v27, v7, v2 + eor3.16b v28, v28, v8, v3 + eor3.16b v29, v29, v9, v4 + rax1.2d v30, v25, v27 + rax1.2d v31, v26, v28 + rax1.2d v27, v27, v29 + rax1.2d v28, v28, v25 + rax1.2d v29, v29, v26 + + // Rho and Phi + eor.16b v0, v0, v29 + xar.2d v25, v1, v30, #64 - 1 + xar.2d v1, v6, v30, #64 - 44 + xar.2d v6, v9, v28, #64 - 20 + xar.2d v9, v22, v31, #64 - 61 + xar.2d v22, v14, v28, #64 - 39 + xar.2d v14, v20, v29, #64 - 18 + xar.2d v26, v2, v31, #64 - 62 + xar.2d v2, v12, v31, #64 - 43 + xar.2d v12, v13, v27, #64 - 25 + xar.2d v13, v19, v28, #64 - 8 + xar.2d v19, v23, v27, #64 - 56 + xar.2d v23, v15, v29, #64 - 41 + xar.2d v15, v4, v28, #64 - 27 + xar.2d v28, v24, v28, #64 - 14 + xar.2d v24, v21, v30, #64 - 2 + xar.2d v8, v8, v27, #64 - 55 + xar.2d v4, v16, v30, #64 - 45 + xar.2d v16, v5, v29, #64 - 36 + xar.2d v5, v3, v27, #64 - 28 + xar.2d v27, v18, v27, #64 - 21 + xar.2d v3, v17, v31, #64 - 15 + xar.2d v30, v11, v30, #64 - 10 + xar.2d v31, v7, v31, #64 - 6 + xar.2d v29, v10, v29, #64 - 3 + + // Chi and Iota + bcax.16b v20, v26, v22, v8 + bcax.16b v21, v8, v23, v22 + bcax.16b v22, v22, v24, v23 + bcax.16b v23, v23, v26, v24 + bcax.16b v24, v24, v8, v26 + + ld1r.2d {{v26}}, [x1], #8 + + bcax.16b v17, v30, v19, v3 + bcax.16b v18, v3, v15, v19 + bcax.16b v19, v19, v16, v15 + bcax.16b v15, v15, v30, v16 + bcax.16b v16, v16, v3, v30 + + bcax.16b v10, v25, v12, v31 + bcax.16b v11, v31, v13, v12 + bcax.16b v12, v12, v14, v13 + bcax.16b v13, v13, v25, v14 + bcax.16b v14, v14, v31, v25 + + bcax.16b v7, v29, v9, v4 + bcax.16b v8, v4, v5, v9 + bcax.16b v9, v9, v6, v5 + bcax.16b v5, v5, v29, v6 + bcax.16b v6, v6, v4, v29 + + bcax.16b v3, v27, v0, v28 + bcax.16b v4, v28, v1, v0 + bcax.16b v0, v0, v2, v1 + bcax.16b v1, 
v1, v27, v2 + bcax.16b v2, v2, v28, v27 + + eor.16b v0,v0,v26 + + // Rounds loop + cbnz w8, 0b + + // Write state + st1.1d {{ v0- v3}}, [x0], #32 + st1.1d {{ v4- v7}}, [x0], #32 + st1.1d {{ v8-v11}}, [x0], #32 + st1.1d {{v12-v15}}, [x0], #32 + st1.1d {{v16-v19}}, [x0], #32 + st1.1d {{v20-v23}}, [x0], #32 + st1.1d {{v24}}, [x0] + ", + in("x0") state.as_mut_ptr(), + in("x1") crate::RC[24-round_count..].as_ptr(), + in("x8") round_count, + clobber_abi("C"), + options(nostack) + ); +} + +#[cfg(all(test, target_feature = "sha3"))] +mod tests { + use super::*; + + #[test] + fn test_keccak_f1600() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt + let state_first = [ + 0xF1258F7940E1DDE7, + 0x84D5CCF933C0478A, + 0xD598261EA65AA9EE, + 0xBD1547306F80494D, + 0x8B284E056253D057, + 0xFF97A42D7F8E6FD4, + 0x90FEE5A0A44647C4, + 0x8C5BDA0CD6192E76, + 0xAD30A6F71B19059C, + 0x30935AB7D08FFC64, + 0xEB5AA93F2317D635, + 0xA9A6E6260D712103, + 0x81A57C16DBCF555F, + 0x43B831CD0347C826, + 0x01F22F1A11A5569F, + 0x05E5635A21D9AE61, + 0x64BEFEF28CC970F2, + 0x613670957BC46611, + 0xB87C5A554FD00ECB, + 0x8C3EE88A1CCF32C8, + 0x940C7922AE3A2614, + 0x1841F924A2C509E4, + 0x16F53526E70465C2, + 0x75F644E97F30A13B, + 0xEAF1FF7B5CECA249, + ]; + let state_second = [ + 0x2D5C954DF96ECB3C, + 0x6A332CD07057B56D, + 0x093D8D1270D76B6C, + 0x8A20D9B25569D094, + 0x4F9C4F99E5E7F156, + 0xF957B9A2DA65FB38, + 0x85773DAE1275AF0D, + 0xFAF4F247C3D810F7, + 0x1F1B9EE6F79A8759, + 0xE4FECC0FEE98B425, + 0x68CE61B6B9CE68A1, + 0xDEEA66C4BA8F974F, + 0x33C43D836EAFB1F5, + 0xE00654042719DBD9, + 0x7CF8A9F009831265, + 0xFD5449A6BF174743, + 0x97DDAD33D8994B40, + 0x48EAD5FC5D0BE774, + 0xE3B8C8EE55B7B03C, + 0x91A0226E649E42E9, + 0x900E3129E7BADD7B, + 0x202A9EC5FAA3CCE8, + 0x5B3402464E1C3DB6, + 0x609F4E62A44C1059, + 0x20D06CD26A8FBF5C, + ]; + + let mut state = [0u64; 25]; + unsafe { p1600_armv8_sha3_asm(&mut state, 
24) }; + assert_eq!(state, state_first); + unsafe { p1600_armv8_sha3_asm(&mut state, 24) }; + assert_eq!(state, state_second); + } +} diff --git a/keccak-precompile/src/lib.rs b/keccak-precompile/src/lib.rs new file mode 100644 index 000000000..50c0bdb5d --- /dev/null +++ b/keccak-precompile/src/lib.rs @@ -0,0 +1,601 @@ +//! Keccak [sponge function](https://en.wikipedia.org/wiki/Sponge_function). +//! +//! If you are looking for SHA-3 hash functions take a look at [`sha3`][1] and +//! [`tiny-keccak`][2] crates. +//! +//! To disable loop unrolling (e.g. for constraint targets) use `no_unroll` +//! feature. +//! +//! ``` +//! // Test vectors are from KeccakCodePackage +//! let mut data = [0u64; 25]; +//! +//! keccak::f1600(&mut data); +//! assert_eq!(data, [ +//! 0xF1258F7940E1DDE7, 0x84D5CCF933C0478A, 0xD598261EA65AA9EE, 0xBD1547306F80494D, +//! 0x8B284E056253D057, 0xFF97A42D7F8E6FD4, 0x90FEE5A0A44647C4, 0x8C5BDA0CD6192E76, +//! 0xAD30A6F71B19059C, 0x30935AB7D08FFC64, 0xEB5AA93F2317D635, 0xA9A6E6260D712103, +//! 0x81A57C16DBCF555F, 0x43B831CD0347C826, 0x01F22F1A11A5569F, 0x05E5635A21D9AE61, +//! 0x64BEFEF28CC970F2, 0x613670957BC46611, 0xB87C5A554FD00ECB, 0x8C3EE88A1CCF32C8, +//! 0x940C7922AE3A2614, 0x1841F924A2C509E4, 0x16F53526E70465C2, 0x75F644E97F30A13B, +//! 0xEAF1FF7B5CECA249, +//! ]); +//! +//! keccak::f1600(&mut data); +//! assert_eq!(data, [ +//! 0x2D5C954DF96ECB3C, 0x6A332CD07057B56D, 0x093D8D1270D76B6C, 0x8A20D9B25569D094, +//! 0x4F9C4F99E5E7F156, 0xF957B9A2DA65FB38, 0x85773DAE1275AF0D, 0xFAF4F247C3D810F7, +//! 0x1F1B9EE6F79A8759, 0xE4FECC0FEE98B425, 0x68CE61B6B9CE68A1, 0xDEEA66C4BA8F974F, +//! 0x33C43D836EAFB1F5, 0xE00654042719DBD9, 0x7CF8A9F009831265, 0xFD5449A6BF174743, +//! 0x97DDAD33D8994B40, 0x48EAD5FC5D0BE774, 0xE3B8C8EE55B7B03C, 0x91A0226E649E42E9, +//! 0x900E3129E7BADD7B, 0x202A9EC5FAA3CCE8, 0x5B3402464E1C3DB6, 0x609F4E62A44C1059, +//! 0x20D06CD26A8FBF5C, +//! ]); +//! ``` +//! +//! [1]: https://docs.rs/sha3 +//! 
[2]: https://docs.rs/tiny-keccak + +#![no_std] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] +#![cfg_attr(feature = "simd", feature(portable_simd))] +#![doc( + html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg", + html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo.svg" +)] +#![allow(non_upper_case_globals)] +#![warn( + clippy::mod_module_files, + clippy::unwrap_used, + missing_docs, + rust_2018_idioms, + unused_lifetimes, + unused_qualifications +)] + +use core::{ + convert::TryInto, + fmt::Debug, + mem::size_of, + ops::{BitAnd, BitAndAssign, BitXor, BitXorAssign, Not}, +}; + +#[rustfmt::skip] +mod unroll; + +#[cfg(all(target_arch = "aarch64", feature = "asm"))] +mod armv8; + +#[cfg(all(target_arch = "aarch64", feature = "asm"))] +cpufeatures::new!(armv8_sha3_intrinsics, "sha3"); + +const PLEN: usize = 25; + +const RHO: [u32; 24] = [ + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44, +]; + +const PI: [usize; 24] = [ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1, +]; + +const RC: [u64; 24] = [ + 0x0000000000000001, + 0x0000000000008082, + 0x800000000000808a, + 0x8000000080008000, + 0x000000000000808b, + 0x0000000080000001, + 0x8000000080008081, + 0x8000000000008009, + 0x000000000000008a, + 0x0000000000000088, + 0x0000000080008009, + 0x000000008000000a, + 0x000000008000808b, + 0x800000000000008b, + 0x8000000000008089, + 0x8000000000008003, + 0x8000000000008002, + 0x8000000000000080, + 0x000000000000800a, + 0x800000008000000a, + 0x8000000080008081, + 0x8000000000008080, + 0x0000000080000001, + 0x8000000080008008, +]; + +/// Keccak is a permutation over an array of lanes which comprise the sponge +/// construction. +pub trait LaneSize: + Copy + + Clone + + Debug + + Default + + PartialEq + + BitAndAssign + + BitAnd + + BitXorAssign + + BitXor + + Not +{ + /// Number of rounds of the Keccak-f permutation. 
+ const KECCAK_F_ROUND_COUNT: usize; + + /// Truncate function. + fn truncate_rc(rc: u64) -> Self; + + /// Rotate left function. + fn rotate_left(self, n: u32) -> Self; +} + +macro_rules! impl_lanesize { + ($type:ty, $round:expr, $truncate:expr) => { + impl LaneSize for $type { + const KECCAK_F_ROUND_COUNT: usize = $round; + + fn truncate_rc(rc: u64) -> Self { + $truncate(rc) + } + + fn rotate_left(self, n: u32) -> Self { + self.rotate_left(n) + } + } + }; +} + +impl_lanesize!(u8, 18, |rc: u64| { rc.to_le_bytes()[0] }); +impl_lanesize!(u16, 20, |rc: u64| { + let tmp = rc.to_le_bytes(); + #[allow(clippy::unwrap_used)] + Self::from_le_bytes(tmp[..size_of::()].try_into().unwrap()) +}); +impl_lanesize!(u32, 22, |rc: u64| { + let tmp = rc.to_le_bytes(); + #[allow(clippy::unwrap_used)] + Self::from_le_bytes(tmp[..size_of::()].try_into().unwrap()) +}); +impl_lanesize!(u64, 24, |rc: u64| { rc }); + +macro_rules! impl_keccak { + ($pname:ident, $fname:ident, $type:ty) => { + /// Keccak-p sponge function + pub fn $pname(state: &mut [$type; PLEN], round_count: usize) { + keccak_p(state, round_count); + } + + /// Keccak-f sponge function + pub fn $fname(state: &mut [$type; PLEN]) { + keccak_p(state, <$type>::KECCAK_F_ROUND_COUNT); + } + }; +} + +impl_keccak!(p200, f200, u8); +impl_keccak!(p400, f400, u16); +impl_keccak!(p800, f800, u32); + +/// Keccak-p[1600, rc] permutation. On the lambda-vm RISC-V guest, the full +/// 24-round permutation is delegated to the Keccak precompile. +#[cfg(not(all(target_arch = "aarch64", feature = "asm")))] +pub fn p1600(state: &mut [u64; PLEN], round_count: usize) { + #[cfg(target_arch = "riscv64")] + if round_count == 24 { + // SAFETY: the host implements the KeccakPermute precompile, applying + // Keccak-f[1600] in place to the 25-lane state. ABI: a0 = state ptr, + // a7 = usize::MAX - 1. 
+ unsafe { + core::arch::asm!( + "ecall", + in("a0") state.as_mut_ptr(), + in("a7") usize::MAX - 1, + ); + } + return; + } + keccak_p(state, round_count); +} + +/// Keccak-f[1600] permutation (full 24 rounds). +#[cfg(not(all(target_arch = "aarch64", feature = "asm")))] +pub fn f1600(state: &mut [u64; PLEN]) { + p1600(state, u64::KECCAK_F_ROUND_COUNT); +} + +/// Keccak-p[1600, rc] permutation. +#[cfg(all(target_arch = "aarch64", feature = "asm"))] +pub fn p1600(state: &mut [u64; PLEN], round_count: usize) { + if armv8_sha3_intrinsics::get() { + unsafe { armv8::p1600_armv8_sha3_asm(state, round_count) } + } else { + keccak_p(state, round_count); + } +} + +/// Keccak-f[1600] permutation. +#[cfg(all(target_arch = "aarch64", feature = "asm"))] +pub fn f1600(state: &mut [u64; PLEN]) { + if armv8_sha3_intrinsics::get() { + unsafe { armv8::p1600_armv8_sha3_asm(state, 24) } + } else { + keccak_p(state, u64::KECCAK_F_ROUND_COUNT); + } +} + +#[cfg(feature = "simd")] +/// SIMD implementations for Keccak-f1600 sponge function +pub mod simd { + use crate::{keccak_p, LaneSize, PLEN}; + pub use core::simd::{u64x2, u64x4, u64x8}; + + macro_rules! 
impl_lanesize_simd_u64xn { + ($type:ty) => { + impl LaneSize for $type { + const KECCAK_F_ROUND_COUNT: usize = 24; + + fn truncate_rc(rc: u64) -> Self { + Self::splat(rc) + } + + fn rotate_left(self, n: u32) -> Self { + self << Self::splat(n.into()) | self >> Self::splat((64 - n).into()) + } + } + }; + } + + impl_lanesize_simd_u64xn!(u64x2); + impl_lanesize_simd_u64xn!(u64x4); + impl_lanesize_simd_u64xn!(u64x8); + + impl_keccak!(p1600x2, f1600x2, u64x2); + impl_keccak!(p1600x4, f1600x4, u64x4); + impl_keccak!(p1600x8, f1600x8, u64x8); +} + +#[allow(unused_assignments)] +/// Generic Keccak-p sponge function +pub fn keccak_p(state: &mut [L; PLEN], round_count: usize) { + if round_count > L::KECCAK_F_ROUND_COUNT { + panic!("A round_count greater than KECCAK_F_ROUND_COUNT is not supported!"); + } + + // https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf#page=25 + // "the rounds of KECCAK-p[b, nr] match the last rounds of KECCAK-f[b]" + let round_consts = &RC[(L::KECCAK_F_ROUND_COUNT - round_count)..L::KECCAK_F_ROUND_COUNT]; + + // not unrolling this loop results in a much smaller function, plus + // it positively influences performance due to the smaller load on I-cache + for &rc in round_consts { + let mut array = [L::default(); 5]; + + // Theta + unroll5!(x, { + unroll5!(y, { + array[x] ^= state[5 * y + x]; + }); + }); + + unroll5!(x, { + unroll5!(y, { + let t1 = array[(x + 4) % 5]; + let t2 = array[(x + 1) % 5].rotate_left(1); + state[5 * y + x] ^= t1 ^ t2; + }); + }); + + // Rho and pi + let mut last = state[1]; + unroll24!(x, { + array[0] = state[PI[x]]; + state[PI[x]] = last.rotate_left(RHO[x]); + last = array[0]; + }); + + // Chi + unroll5!(y_step, { + let y = 5 * y_step; + + unroll5!(x, { + array[x] = state[y + x]; + }); + + unroll5!(x, { + let t1 = !array[(x + 1) % 5]; + let t2 = array[(x + 2) % 5]; + state[y + x] = array[x] ^ (t1 & t2); + }); + }); + + // Iota + state[0] ^= L::truncate_rc(rc); + } +} + +#[cfg(test)] +mod tests { + use crate::{keccak_p, 
LaneSize, PLEN}; + + fn keccak_f(state_first: [L; PLEN], state_second: [L; PLEN]) { + let mut state = [L::default(); PLEN]; + + keccak_p(&mut state, L::KECCAK_F_ROUND_COUNT); + assert_eq!(state, state_first); + + keccak_p(&mut state, L::KECCAK_F_ROUND_COUNT); + assert_eq!(state, state_second); + } + + #[test] + fn keccak_f200() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-200-IntermediateValues.txt + let state_first = [ + 0x3C, 0x28, 0x26, 0x84, 0x1C, 0xB3, 0x5C, 0x17, 0x1E, 0xAA, 0xE9, 0xB8, 0x11, 0x13, + 0x4C, 0xEA, 0xA3, 0x85, 0x2C, 0x69, 0xD2, 0xC5, 0xAB, 0xAF, 0xEA, + ]; + let state_second = [ + 0x1B, 0xEF, 0x68, 0x94, 0x92, 0xA8, 0xA5, 0x43, 0xA5, 0x99, 0x9F, 0xDB, 0x83, 0x4E, + 0x31, 0x66, 0xA1, 0x4B, 0xE8, 0x27, 0xD9, 0x50, 0x40, 0x47, 0x9E, + ]; + + keccak_f::(state_first, state_second); + } + + #[test] + fn keccak_f400() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-400-IntermediateValues.txt + let state_first = [ + 0x09F5, 0x40AC, 0x0FA9, 0x14F5, 0xE89F, 0xECA0, 0x5BD1, 0x7870, 0xEFF0, 0xBF8F, 0x0337, + 0x6052, 0xDC75, 0x0EC9, 0xE776, 0x5246, 0x59A1, 0x5D81, 0x6D95, 0x6E14, 0x633E, 0x58EE, + 0x71FF, 0x714C, 0xB38E, + ]; + let state_second = [ + 0xE537, 0xD5D6, 0xDBE7, 0xAAF3, 0x9BC7, 0xCA7D, 0x86B2, 0xFDEC, 0x692C, 0x4E5B, 0x67B1, + 0x15AD, 0xA7F7, 0xA66F, 0x67FF, 0x3F8A, 0x2F99, 0xE2C2, 0x656B, 0x5F31, 0x5BA6, 0xCA29, + 0xC224, 0xB85C, 0x097C, + ]; + + keccak_f::(state_first, state_second); + } + + #[test] + fn keccak_f800() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-800-IntermediateValues.txt + let state_first = [ + 0xE531D45D, 0xF404C6FB, 0x23A0BF99, 0xF1F8452F, 0x51FFD042, 0xE539F578, 0xF00B80A7, + 0xAF973664, 0xBF5AF34C, 0x227A2424, 0x88172715, 0x9F685884, 
0xB15CD054, 0x1BF4FC0E, + 0x6166FA91, 0x1A9E599A, 0xA3970A1F, 0xAB659687, 0xAFAB8D68, 0xE74B1015, 0x34001A98, + 0x4119EFF3, 0x930A0E76, 0x87B28070, 0x11EFE996, + ]; + let state_second = [ + 0x75BF2D0D, 0x9B610E89, 0xC826AF40, 0x64CD84AB, 0xF905BDD6, 0xBC832835, 0x5F8001B9, + 0x15662CCE, 0x8E38C95E, 0x701FE543, 0x1B544380, 0x89ACDEFF, 0x51EDB5DE, 0x0E9702D9, + 0x6C19AA16, 0xA2913EEE, 0x60754E9A, 0x9819063C, 0xF4709254, 0xD09F9084, 0x772DA259, + 0x1DB35DF7, 0x5AA60162, 0x358825D5, 0xB3783BAB, + ]; + + keccak_f::(state_first, state_second); + } + + #[test] + fn keccak_f1600() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt + let state_first = [ + 0xF1258F7940E1DDE7, + 0x84D5CCF933C0478A, + 0xD598261EA65AA9EE, + 0xBD1547306F80494D, + 0x8B284E056253D057, + 0xFF97A42D7F8E6FD4, + 0x90FEE5A0A44647C4, + 0x8C5BDA0CD6192E76, + 0xAD30A6F71B19059C, + 0x30935AB7D08FFC64, + 0xEB5AA93F2317D635, + 0xA9A6E6260D712103, + 0x81A57C16DBCF555F, + 0x43B831CD0347C826, + 0x01F22F1A11A5569F, + 0x05E5635A21D9AE61, + 0x64BEFEF28CC970F2, + 0x613670957BC46611, + 0xB87C5A554FD00ECB, + 0x8C3EE88A1CCF32C8, + 0x940C7922AE3A2614, + 0x1841F924A2C509E4, + 0x16F53526E70465C2, + 0x75F644E97F30A13B, + 0xEAF1FF7B5CECA249, + ]; + let state_second = [ + 0x2D5C954DF96ECB3C, + 0x6A332CD07057B56D, + 0x093D8D1270D76B6C, + 0x8A20D9B25569D094, + 0x4F9C4F99E5E7F156, + 0xF957B9A2DA65FB38, + 0x85773DAE1275AF0D, + 0xFAF4F247C3D810F7, + 0x1F1B9EE6F79A8759, + 0xE4FECC0FEE98B425, + 0x68CE61B6B9CE68A1, + 0xDEEA66C4BA8F974F, + 0x33C43D836EAFB1F5, + 0xE00654042719DBD9, + 0x7CF8A9F009831265, + 0xFD5449A6BF174743, + 0x97DDAD33D8994B40, + 0x48EAD5FC5D0BE774, + 0xE3B8C8EE55B7B03C, + 0x91A0226E649E42E9, + 0x900E3129E7BADD7B, + 0x202A9EC5FAA3CCE8, + 0x5B3402464E1C3DB6, + 0x609F4E62A44C1059, + 0x20D06CD26A8FBF5C, + ]; + + keccak_f::(state_first, state_second); + } + + // Faithfulness check for the precompile 
shim: on the host the software + // path runs, so `crate::f1600` / `crate::p1600(_, 24)` must produce + // byte-identical output to the upstream `keccak` crate (pulled in as a + // renamed dev-dependency to avoid a self-patch cycle). + #[test] + fn p1600_matches_upstream_f1600() { + // Known Keccak-f[1600] test vector: f1600 of the all-zero state. + let expected: [u64; 25] = [ + 0xF1258F7940E1DDE7, + 0x84D5CCF933C0478A, + 0xD598261EA65AA9EE, + 0xBD1547306F80494D, + 0x8B284E056253D057, + 0xFF97A42D7F8E6FD4, + 0x90FEE5A0A44647C4, + 0x8C5BDA0CD6192E76, + 0xAD30A6F71B19059C, + 0x30935AB7D08FFC64, + 0xEB5AA93F2317D635, + 0xA9A6E6260D712103, + 0x81A57C16DBCF555F, + 0x43B831CD0347C826, + 0x01F22F1A11A5569F, + 0x05E5635A21D9AE61, + 0x64BEFEF28CC970F2, + 0x613670957BC46611, + 0xB87C5A554FD00ECB, + 0x8C3EE88A1CCF32C8, + 0x940C7922AE3A2614, + 0x1841F924A2C509E4, + 0x16F53526E70465C2, + 0x75F644E97F30A13B, + 0xEAF1FF7B5CECA249, + ]; + + let mut s = [0u64; 25]; + crate::f1600(&mut s); + assert_eq!(s, expected, "f1600(0) must match the standard test vector"); + + // p1600(_, 24) must equal f1600. + let mut p = [0u64; 25]; + crate::p1600(&mut p, 24); + assert_eq!(p, expected, "p1600(_, 24) must equal f1600"); + + // Cross-check against the upstream crate over pseudo-random states. + let mut x: u64 = 0x243F6A8885A308D3; // digits of pi + for _ in 0..256 { + let mut a = [0u64; 25]; + for lane in a.iter_mut() { + // xorshift64 PRNG + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + *lane = x; + } + let mut b = a; + + crate::f1600(&mut a); + dev_dep_keccak::f1600(&mut b); + assert_eq!(a, b, "shim f1600 must match upstream f1600"); + + // Also exercise reduced-round p1600 against upstream p1600. 
+ for rc in [1usize, 12, 23, 24] { + let mut a = [0u64; 25]; + let mut b = [0u64; 25]; + for i in 0..25 { + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + a[i] = x; + b[i] = x; + } + crate::p1600(&mut a, rc); + dev_dep_keccak::p1600(&mut b, rc); + assert_eq!(a, b, "shim p1600 must match upstream p1600 for rc"); + } + } + } + + #[cfg(feature = "simd")] + mod simd { + use super::keccak_f; + use core::simd::{u64x2, u64x4, u64x8}; + + macro_rules! impl_keccak_f1600xn { + ($name:ident, $type:ty) => { + #[test] + fn $name() { + // Test vectors are copied from XKCP (eXtended Keccak Code Package) + // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt + let state_first = [ + <$type>::splat(0xF1258F7940E1DDE7), + <$type>::splat(0x84D5CCF933C0478A), + <$type>::splat(0xD598261EA65AA9EE), + <$type>::splat(0xBD1547306F80494D), + <$type>::splat(0x8B284E056253D057), + <$type>::splat(0xFF97A42D7F8E6FD4), + <$type>::splat(0x90FEE5A0A44647C4), + <$type>::splat(0x8C5BDA0CD6192E76), + <$type>::splat(0xAD30A6F71B19059C), + <$type>::splat(0x30935AB7D08FFC64), + <$type>::splat(0xEB5AA93F2317D635), + <$type>::splat(0xA9A6E6260D712103), + <$type>::splat(0x81A57C16DBCF555F), + <$type>::splat(0x43B831CD0347C826), + <$type>::splat(0x01F22F1A11A5569F), + <$type>::splat(0x05E5635A21D9AE61), + <$type>::splat(0x64BEFEF28CC970F2), + <$type>::splat(0x613670957BC46611), + <$type>::splat(0xB87C5A554FD00ECB), + <$type>::splat(0x8C3EE88A1CCF32C8), + <$type>::splat(0x940C7922AE3A2614), + <$type>::splat(0x1841F924A2C509E4), + <$type>::splat(0x16F53526E70465C2), + <$type>::splat(0x75F644E97F30A13B), + <$type>::splat(0xEAF1FF7B5CECA249), + ]; + let state_second = [ + <$type>::splat(0x2D5C954DF96ECB3C), + <$type>::splat(0x6A332CD07057B56D), + <$type>::splat(0x093D8D1270D76B6C), + <$type>::splat(0x8A20D9B25569D094), + <$type>::splat(0x4F9C4F99E5E7F156), + <$type>::splat(0xF957B9A2DA65FB38), + <$type>::splat(0x85773DAE1275AF0D), + <$type>::splat(0xFAF4F247C3D810F7), 
+ <$type>::splat(0x1F1B9EE6F79A8759), + <$type>::splat(0xE4FECC0FEE98B425), + <$type>::splat(0x68CE61B6B9CE68A1), + <$type>::splat(0xDEEA66C4BA8F974F), + <$type>::splat(0x33C43D836EAFB1F5), + <$type>::splat(0xE00654042719DBD9), + <$type>::splat(0x7CF8A9F009831265), + <$type>::splat(0xFD5449A6BF174743), + <$type>::splat(0x97DDAD33D8994B40), + <$type>::splat(0x48EAD5FC5D0BE774), + <$type>::splat(0xE3B8C8EE55B7B03C), + <$type>::splat(0x91A0226E649E42E9), + <$type>::splat(0x900E3129E7BADD7B), + <$type>::splat(0x202A9EC5FAA3CCE8), + <$type>::splat(0x5B3402464E1C3DB6), + <$type>::splat(0x609F4E62A44C1059), + <$type>::splat(0x20D06CD26A8FBF5C), + ]; + + keccak_f::<$type>(state_first, state_second); + } + }; + } + + impl_keccak_f1600xn!(keccak_f1600x2, u64x2); + impl_keccak_f1600xn!(keccak_f1600x4, u64x4); + impl_keccak_f1600xn!(keccak_f1600x8, u64x8); + } +} diff --git a/keccak-precompile/src/unroll.rs b/keccak-precompile/src/unroll.rs new file mode 100644 index 000000000..eab745b9d --- /dev/null +++ b/keccak-precompile/src/unroll.rs @@ -0,0 +1,62 @@ +/// unroll5 +#[cfg(not(feature = "no_unroll"))] +#[macro_export] +macro_rules! unroll5 { + ($var:ident, $body:block) => { + { const $var: usize = 0; $body; } + { const $var: usize = 1; $body; } + { const $var: usize = 2; $body; } + { const $var: usize = 3; $body; } + { const $var: usize = 4; $body; } + }; +} + +/// unroll5 +#[cfg(feature = "no_unroll")] +#[macro_export] +macro_rules! unroll5 { + ($var:ident, $body:block) => { + for $var in 0..5 $body + } +} + +/// unroll24 +#[cfg(not(feature = "no_unroll"))] +#[macro_export] +macro_rules! 
unroll24 { + ($var: ident, $body: block) => { + { const $var: usize = 0; $body; } + { const $var: usize = 1; $body; } + { const $var: usize = 2; $body; } + { const $var: usize = 3; $body; } + { const $var: usize = 4; $body; } + { const $var: usize = 5; $body; } + { const $var: usize = 6; $body; } + { const $var: usize = 7; $body; } + { const $var: usize = 8; $body; } + { const $var: usize = 9; $body; } + { const $var: usize = 10; $body; } + { const $var: usize = 11; $body; } + { const $var: usize = 12; $body; } + { const $var: usize = 13; $body; } + { const $var: usize = 14; $body; } + { const $var: usize = 15; $body; } + { const $var: usize = 16; $body; } + { const $var: usize = 17; $body; } + { const $var: usize = 18; $body; } + { const $var: usize = 19; $body; } + { const $var: usize = 20; $body; } + { const $var: usize = 21; $body; } + { const $var: usize = 22; $body; } + { const $var: usize = 23; $body; } + }; +} + +/// unroll24 +#[cfg(feature = "no_unroll")] +#[macro_export] +macro_rules! unroll24 { + ($var:ident, $body:block) => { + for $var in 0..24 $body + } +} From d279ab6298e693bfe4927eeb05700ee57fe5cfc9 Mon Sep 17 00:00:00 2001 From: diegokingston Date: Fri, 29 May 2026 17:05:02 -0300 Subject: [PATCH 8/8] tooling: A/B script to measure verifier with/without Keccak precompile Toggles the keccak-precompile [patch] in an external recursion-guest Cargo.toml, runs the verify bench both ways, prints the cycle comparison. Parameterized by GUEST_DIR + VERIFY_BENCH_CMD (the guest harness lives outside this repo). 
--- scripts/measure_verifier_precompile.sh | 95 ++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100755 scripts/measure_verifier_precompile.sh diff --git a/scripts/measure_verifier_precompile.sh b/scripts/measure_verifier_precompile.sh new file mode 100755 index 000000000..e3657a724 --- /dev/null +++ b/scripts/measure_verifier_precompile.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# Measure the recursion-guest verifier WITH vs WITHOUT the Keccak precompile. +# +# The recursion guest (the STARK verifier compiled to RISC-V) lives in YOUR +# guest crate, outside this repo. This script A/Bs the `keccak` precompile shim +# by toggling a `[patch.crates-io] keccak = { path = }` +# entry in the guest's Cargo.toml, running your verify benchmark each time, and +# printing both results so you can see the cycle drop. +# +# Usage: +# GUEST_DIR=/path/to/recursion-guest \ +# VERIFY_BENCH_CMD='cargo run --release -- bench-verify' \ +# scripts/measure_verifier_precompile.sh +# +# Required env: +# GUEST_DIR Root of your recursion-guest crate (the one with the +# Cargo.toml that builds the verifier-as-RISC-V program). +# VERIFY_BENCH_CMD The command (run FROM $GUEST_DIR) that builds + runs the +# guest verify and prints its RISC-V cycle count. Whatever +# you already use to get the "40.5B / 67M" numbers. +# +# Optional env: +# CYCLE_GREP A grep -oE pattern to extract the cycle number from the +# bench output (default tries common forms). Purely for the +# summary line; full output is always shown. +# +# Correctness: the shim routes only the Keccak-f[1600] permutation to the VM +# precompile (a0=state ptr, a7=usize::MAX-1), reusing sha3's sponge/padding, so +# every hash is byte-identical — the verify result is unchanged, only faster. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +SHIM_DIR="$ROOT_DIR/keccak-precompile" + +: "${GUEST_DIR:?set GUEST_DIR to your recursion-guest crate root}" +: "${VERIFY_BENCH_CMD:?set VERIFY_BENCH_CMD to the command that runs your guest verify bench}" +CYCLE_GREP="${CYCLE_GREP:-[0-9][0-9.,]*[ ]*(cycles|B|M|instructions)}" + +GUEST_DIR="$(cd "$GUEST_DIR" && pwd)" +MANIFEST="$GUEST_DIR/Cargo.toml" +[ -f "$MANIFEST" ] || { echo "no Cargo.toml in $GUEST_DIR" >&2; exit 1; } +[ -f "$SHIM_DIR/Cargo.toml" ] || { echo "shim not found at $SHIM_DIR" >&2; exit 1; } + +BACKUP="$(mktemp)" +cp "$MANIFEST" "$BACKUP" +restore() { cp "$BACKUP" "$MANIFEST"; rm -f "$BACKUP"; } +trap restore EXIT + +GREEN='\033[0;32m'; BOLD='\033[1m'; NC='\033[0m' + +run_bench() { + local label="$1" + echo -e "\n${BOLD}=== verify bench: $label ===${NC}" + ( cd "$GUEST_DIR" && eval "$VERIFY_BENCH_CMD" ) | tee "/tmp/verify_bench_${label}.out" +} + +add_patch() { + # Add `keccak = { path = SHIM }` under [patch.crates-io], creating the + # section if absent. Duplicate [patch.crates-io] tables are a cargo error, + # so we append into the existing one when present. 
+ local line="keccak = { path = \"$SHIM_DIR\" }" + if grep -qE '^\[patch\.crates-io\]' "$MANIFEST"; then + # insert right after the section header + awk -v l="$line" ' + { print } + /^\[patch\.crates-io\]/ && !done { print l; done=1 } + ' "$MANIFEST" > "$MANIFEST.tmp" && mv "$MANIFEST.tmp" "$MANIFEST" + else + printf '\n[patch.crates-io]\n%s\n' "$line" >> "$MANIFEST" + fi +} + +extract() { grep -oiE "$CYCLE_GREP" "$1" | head -1 || true; } + +# 1) Baseline (software Keccak) +restore # ensure clean +cp "$MANIFEST" "$BACKUP" +run_bench "baseline" + +# 2) With precompile shim +add_patch +echo -e "${GREEN}[patched] added: keccak = { path = $SHIM_DIR }${NC}" +run_bench "precompile" + +# restore happens via trap + +echo "" +echo -e "${BOLD}=== Summary ===${NC}" +echo " baseline (software Keccak) : $(extract /tmp/verify_bench_baseline.out)" +echo " with precompile shim : $(extract /tmp/verify_bench_precompile.out)" +echo "" +echo "Full outputs: /tmp/verify_bench_baseline.out /tmp/verify_bench_precompile.out" +echo "Guest Cargo.toml restored."