From c512dff2b128765aef6336ef525f12040a059931 Mon Sep 17 00:00:00 2001 From: dylan Date: Wed, 20 May 2026 00:07:33 -0600 Subject: [PATCH] fix(hot): split history shards by encoded size, not count Closes ENG-2287 `AccountsHistory` and `StorageHistory` are stored DUPSORT, so each dup value (key2 || encoded BlockNumberList) is capped at MDBX's DUPSORT value limit (~1980 B on 4 KB pages). The previous splitter gated on ShardedKey::SHARD_COUNT (2000), but a roaring BlockNumberList of 2000 sparse indices serialises to >20 KB. Once a hot account's shard tipped over, every live block touching it crashed the sidecar with MDBX_BAD_VALSIZE. Replace count-based splitting in append_to_sharded_history with a size-based check against a new ShardedKey::MAX_SHARD_BYTES (1500). When the merged list overflows the budget, binary-search the largest prefix that fits and emit shards until the remainder is consumed. Existing oversized shards self-heal on the next write to them, so no migration is required. Add test_history_shard_fits_in_dupsort_limit to the hot conformance suite. It exercises the real update_history_indices_inconsistent path with 2000 sparse blocks (one per roaring container, the worst-case encoding) and asserts the union across shards round-trips. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/hot/src/conformance/history.rs | 53 ++++++++++++++++- crates/hot/src/conformance/mod.rs | 1 + crates/hot/src/db/inconsistent.rs | 83 ++++++++++++++++++++------- crates/types/src/sharded.rs | 17 +++++- 4 files changed, 132 insertions(+), 22 deletions(-) diff --git a/crates/hot/src/conformance/history.rs b/crates/hot/src/conformance/history.rs index af0250c..3f7a67a 100644 --- a/crates/hot/src/conformance/history.rs +++ b/crates/hot/src/conformance/history.rs @@ -2,7 +2,7 @@ use crate::{ db::{HistoryRead, UnsafeDbWrite, UnsafeHistoryWrite}, - model::{HotKv, HotKvWrite}, + model::{HotKv, HotKvRead, HotKvWrite}, tables, }; use alloy::primitives::{U256, address}; @@ -454,3 +454,54 @@ pub fn test_delete_and_rewrite_dual(hot_kv: &T) { assert_eq!(hist.unwrap().iter().collect::>(), vec![100, 200, 300]); } } + +/// Regression: account history must split into MDBX-fittable shards when an +/// account is touched on many sparse blocks. +/// +/// Why: AccountsHistory is DUPSORT, so each stored dup value +/// (`key2 || encoded BlockNumberList`) is capped at MDBX's DUPSORT value +/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of 2000 +/// indices spread across many 16-bit containers exceeds 20 KB. The history +/// pipeline must split shards by encoded size, not by index count, so any +/// pattern of `update_history_indices_inconsistent` survives. ENG-2287. +pub fn test_history_shard_fits_in_dupsort_limit(hot_kv: &T) { + let addr = address!("0xcccccccccccccccccccccccccccccccccccccccc"); + + // Sparse pattern: 2000 block-numbers spread far apart so each lives in + // its own roaring 16-bit container — the worst-case for serialised size + // (per-container header dominates). Write a change-set per block, then + // ask the history pipeline to index the full range in one call. + let blocks: Vec = + (0..ShardedKey::SHARD_COUNT as u64).map(|i| i.saturating_mul(100_000) + 1).collect(); + + { + let writer = hot_kv.writer().unwrap(); + let acc = Account::default(); + for &b in &blocks { + writer.write_account_prestate(b, addr, &acc).unwrap(); + } + writer.commit().unwrap(); + } + + { + let writer = hot_kv.writer().unwrap(); + let first = *blocks.first().unwrap(); + let last = *blocks.last().unwrap(); + writer.update_history_indices_inconsistent(first..=last).unwrap(); + writer.commit().unwrap(); + } + + // Walk every shard for this address; the union must equal `blocks`. + let reader = hot_kv.reader().unwrap(); + let mut cursor = reader.traverse_dual::().unwrap(); + let mut seen = Vec::new(); + let mut entry = cursor.last_of_k1(&addr).unwrap(); + while let Some((a, _shard_key, list)) = entry { + assert_eq!(a, addr); + let list: BlockNumberList = list; + seen.extend(list.iter()); + entry = cursor.previous_k2().unwrap(); + } + seen.sort_unstable(); + assert_eq!(seen, blocks, "all touched blocks must be reachable across shards"); +} diff --git a/crates/hot/src/conformance/mod.rs b/crates/hot/src/conformance/mod.rs index 2639865..1b17cb7 100644 --- a/crates/hot/src/conformance/mod.rs +++ b/crates/hot/src/conformance/mod.rs @@ -27,6 +27,7 @@ pub fn conformance(hot_kv: &T) { test_bytecode_roundtrip(hot_kv); test_account_history(hot_kv); test_storage_history(hot_kv); + test_history_shard_fits_in_dupsort_limit(hot_kv); test_account_changes(hot_kv); test_storage_changes(hot_kv); test_missing_reads(hot_kv); diff --git a/crates/hot/src/db/inconsistent.rs b/crates/hot/src/db/inconsistent.rs index 9a47a2b..5978e72 100644 --- a/crates/hot/src/db/inconsistent.rs +++ b/crates/hot/src/db/inconsistent.rs @@ -521,13 +521,23 @@ impl UnsafeHistoryWrite for T where T: UnsafeDbWrite + HotKvWrite {} /// This helper handles the common pattern of: /// 1. Appending new block numbers to an existing shard /// 2. Deleting the old shard if it exists -/// 3. Splitting into multiple shards if the result exceeds the shard size +/// 3. Splitting into multiple shards if the encoded result exceeds the +/// backend's per-value size budget /// /// # Arguments /// - `existing`: The current last shard (key, list) if any /// - `indices`: New block numbers to append /// - `delete_old`: Called to delete the old shard key before writing new ones /// - `write_shard`: Called for each resulting shard (highest_block, list) +/// +/// # Splitting strategy +/// +/// Splitting is driven by the encoded byte size of the [`BlockNumberList`], +/// not by the index count. This is required because the history tables are +/// stored DUPSORT in MDBX, which caps each dup value at ~1980 bytes on +/// 4 KB pages. A count-based threshold cannot guarantee fitting under that +/// cap because roaring's serialised size depends on the value distribution, +/// not just the count. See [`ShardedKey::MAX_SHARD_BYTES`] (ENG-2287). fn append_to_sharded_history( existing: Option<(K, BlockNumberList)>, indices: impl IntoIterator, @@ -550,29 +560,62 @@ where delete_old(key).map_err(HistoryError::Db)?; } - // Fast path: all indices fit in one shard - if last_shard.len() <= ShardedKey::SHARD_COUNT as u64 { + // Fast path: encoded list fits under the per-shard byte budget. + if last_shard.serialized_size() <= ShardedKey::MAX_SHARD_BYTES { return write_shard(u64::MAX, &last_shard).map_err(HistoryError::Db); } - // Slow path: rechunk into multiple shards - // Reuse a single buffer to avoid allocating a new Vec per chunk - let mut chunk_buf = Vec::with_capacity(ShardedKey::SHARD_COUNT); - let mut iter = last_shard.iter().peekable(); - - while iter.peek().is_some() { - chunk_buf.clear(); - chunk_buf.extend(iter.by_ref().take(ShardedKey::SHARD_COUNT)); - - let highest = if iter.peek().is_some() { - *chunk_buf.last().expect("chunk_buf is non-empty") - } else { - // Insert last list with `u64::MAX` - u64::MAX - }; - - let shard = BlockNumberList::new_pre_sorted(chunk_buf.iter().copied()); + // Slow path: re-chunk into multiple shards whose encoded sizes each + // fit under the budget. + let all: Vec = last_shard.iter().collect(); + let mut start = 0; + while start < all.len() { + let take = max_prefix_fitting(&all[start..]); + // A single value is always far smaller than MAX_SHARD_BYTES, so we + // can always make forward progress. + debug_assert!(take > 0, "no prefix fits in MAX_SHARD_BYTES"); + let end = start + take; + let highest = if end == all.len() { u64::MAX } else { all[end - 1] }; + let shard = BlockNumberList::new_pre_sorted(all[start..end].iter().copied()); write_shard(highest, &shard).map_err(HistoryError::Db)?; + start = end; } Ok(()) } + +/// Binary-search the largest prefix length `n` of `indices` whose encoded +/// `BlockNumberList` fits in [`ShardedKey::MAX_SHARD_BYTES`]. +/// +/// Returns 0 only if `indices` is empty. +fn max_prefix_fitting(indices: &[u64]) -> usize { + if indices.is_empty() { + return 0; + } + // Optimistic check: the entire slice may fit. Skips the binary search + // in the common case where the caller passes a small remainder. + if encoded_size_of(indices) <= ShardedKey::MAX_SHARD_BYTES { + return indices.len(); + } + let mut lo = 1usize; + let mut hi = indices.len(); + let mut best = 1usize; + while lo <= hi { + let mid = (lo + hi) / 2; + if encoded_size_of(&indices[..mid]) <= ShardedKey::MAX_SHARD_BYTES { + best = mid; + lo = mid + 1; + } else { + if mid == 1 { + // Even a single index doesn't fit — shouldn't happen but + // bail out to avoid an infinite loop. + break; + } + hi = mid - 1; + } + } + best +} + +fn encoded_size_of(indices: &[u64]) -> usize { + BlockNumberList::new_pre_sorted(indices.iter().copied()).serialized_size() +} diff --git a/crates/types/src/sharded.rs b/crates/types/src/sharded.rs index bc5ffed..4eba074 100644 --- a/crates/types/src/sharded.rs +++ b/crates/types/src/sharded.rs @@ -13,8 +13,23 @@ pub struct ShardedKey { } impl ShardedKey<()> { - /// Number of indices in one shard. + /// Soft cap on the number of indices in one shard. + /// + /// This is a sanity ceiling used alongside [`Self::MAX_SHARD_BYTES`]; + /// shard splitting is driven by encoded size, not by this count. pub const SHARD_COUNT: usize = 2000; + + /// Maximum encoded byte size of a single shard's [`BlockNumberList`]. + /// + /// [`BlockNumberList`]: crate::BlockNumberList + /// + /// The MDBX `DUPSORT` value limit is ~1980 bytes on 4 KB pages + /// (Linux production). Each stored dup value is `key2 || encoded list`, + /// so this budget reserves headroom for `ShardedKey` (40 bytes), + /// the 2-byte length prefix on `BlockNumberList`, and per-node overhead + /// inside MDBX. Exceeding this triggers `MDBX_BAD_VALSIZE` at write time + /// (ENG-2287). + pub const MAX_SHARD_BYTES: usize = 1500; } impl ShardedKey {