Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion crates/hot/src/conformance/history.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use crate::{
db::{HistoryRead, UnsafeDbWrite, UnsafeHistoryWrite},
model::{HotKv, HotKvWrite},
model::{HotKv, HotKvRead, HotKvWrite},
tables,
};
use alloy::primitives::{U256, address};
Expand Down Expand Up @@ -454,3 +454,54 @@ pub fn test_delete_and_rewrite_dual<T: HotKv>(hot_kv: &T) {
assert_eq!(hist.unwrap().iter().collect::<Vec<_>>(), vec![100, 200, 300]);
}
}

/// Regression: account history must split into MDBX-fittable shards when an
/// account is touched on many sparse blocks.
///
/// Why: AccountsHistory is DUPSORT, so each stored dup value
/// (`key2 || encoded BlockNumberList`) is capped at MDBX's DUPSORT value
/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of 2000
/// indices spread across many 16-bit containers exceeds 20 KB. The history
/// pipeline must split shards by encoded size, not by index count, so any
/// pattern of `update_history_indices_inconsistent` survives. ENG-2287.
pub fn test_history_shard_fits_in_dupsort_limit<T: HotKv>(hot_kv: &T) {
let addr = address!("0xcccccccccccccccccccccccccccccccccccccccc");

// Sparse pattern: 2000 block-numbers spread far apart so each lives in
// its own roaring 16-bit container — the worst-case for serialised size
// (per-container header dominates). Write a change-set per block, then
// ask the history pipeline to index the full range in one call.
let blocks: Vec<u64> =
(0..ShardedKey::SHARD_COUNT as u64).map(|i| i.saturating_mul(100_000) + 1).collect();

{
let writer = hot_kv.writer().unwrap();
let acc = Account::default();
for &b in &blocks {
writer.write_account_prestate(b, addr, &acc).unwrap();
}
writer.commit().unwrap();
}

{
let writer = hot_kv.writer().unwrap();
let first = *blocks.first().unwrap();
let last = *blocks.last().unwrap();
writer.update_history_indices_inconsistent(first..=last).unwrap();
writer.commit().unwrap();
}

// Walk every shard for this address; the union must equal `blocks`.
let reader = hot_kv.reader().unwrap();
let mut cursor = reader.traverse_dual::<tables::AccountsHistory>().unwrap();
let mut seen = Vec::new();
let mut entry = cursor.last_of_k1(&addr).unwrap();
while let Some((a, _shard_key, list)) = entry {
assert_eq!(a, addr);
let list: BlockNumberList = list;
seen.extend(list.iter());
entry = cursor.previous_k2().unwrap();
}
seen.sort_unstable();
assert_eq!(seen, blocks, "all touched blocks must be reachable across shards");
}
1 change: 1 addition & 0 deletions crates/hot/src/conformance/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ pub fn conformance<T: HotKv>(hot_kv: &T) {
test_bytecode_roundtrip(hot_kv);
test_account_history(hot_kv);
test_storage_history(hot_kv);
test_history_shard_fits_in_dupsort_limit(hot_kv);
test_account_changes(hot_kv);
test_storage_changes(hot_kv);
test_missing_reads(hot_kv);
Expand Down
83 changes: 63 additions & 20 deletions crates/hot/src/db/inconsistent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -521,13 +521,23 @@ impl<T> UnsafeHistoryWrite for T where T: UnsafeDbWrite + HotKvWrite {}
/// This helper handles the common pattern of:
/// 1. Appending new block numbers to an existing shard
/// 2. Deleting the old shard if it exists
/// 3. Splitting into multiple shards if the result exceeds the shard size
/// 3. Splitting into multiple shards if the encoded result exceeds the
/// backend's per-value size budget
///
/// # Arguments
/// - `existing`: The current last shard (key, list) if any
/// - `indices`: New block numbers to append
/// - `delete_old`: Called to delete the old shard key before writing new ones
/// - `write_shard`: Called for each resulting shard (highest_block, list)
///
/// # Splitting strategy
///
/// Splitting is driven by the encoded byte size of the [`BlockNumberList`],
/// not by the index count. This is required because the history tables are
/// stored DUPSORT in MDBX, which caps each dup value at ~1980 bytes on
/// 4 KB pages. A count-based threshold cannot guarantee fitting under that
/// cap because roaring's serialised size depends on the value distribution,
/// not just the count. See [`ShardedKey::MAX_SHARD_BYTES`] (ENG-2287).
fn append_to_sharded_history<K, E, D, W>(
existing: Option<(K, BlockNumberList)>,
indices: impl IntoIterator<Item = u64>,
Expand All @@ -550,29 +560,62 @@ where
delete_old(key).map_err(HistoryError::Db)?;
}

// Fast path: all indices fit in one shard
if last_shard.len() <= ShardedKey::SHARD_COUNT as u64 {
// Fast path: encoded list fits under the per-shard byte budget.
if last_shard.serialized_size() <= ShardedKey::MAX_SHARD_BYTES {
return write_shard(u64::MAX, &last_shard).map_err(HistoryError::Db);
}

// Slow path: rechunk into multiple shards
// Reuse a single buffer to avoid allocating a new Vec per chunk
let mut chunk_buf = Vec::with_capacity(ShardedKey::SHARD_COUNT);
let mut iter = last_shard.iter().peekable();

while iter.peek().is_some() {
chunk_buf.clear();
chunk_buf.extend(iter.by_ref().take(ShardedKey::SHARD_COUNT));

let highest = if iter.peek().is_some() {
*chunk_buf.last().expect("chunk_buf is non-empty")
} else {
// Insert last list with `u64::MAX`
u64::MAX
};

let shard = BlockNumberList::new_pre_sorted(chunk_buf.iter().copied());
// Slow path: re-chunk into multiple shards whose encoded sizes each
// fit under the budget.
let all: Vec<u64> = last_shard.iter().collect();
let mut start = 0;
while start < all.len() {
let take = max_prefix_fitting(&all[start..]);
// A single value is always far smaller than MAX_SHARD_BYTES, so we
// can always make forward progress.
debug_assert!(take > 0, "no prefix fits in MAX_SHARD_BYTES");
let end = start + take;
let highest = if end == all.len() { u64::MAX } else { all[end - 1] };
let shard = BlockNumberList::new_pre_sorted(all[start..end].iter().copied());
write_shard(highest, &shard).map_err(HistoryError::Db)?;
start = end;
}
Ok(())
}

/// Binary-search the largest prefix length `n` of `indices` whose encoded
/// `BlockNumberList` fits in [`ShardedKey::MAX_SHARD_BYTES`].
///
/// Returns 0 only if `indices` is empty.
fn max_prefix_fitting(indices: &[u64]) -> usize {
if indices.is_empty() {
return 0;
}
// Optimistic check: the entire slice may fit. Skips the binary search
// in the common case where the caller passes a small remainder.
if encoded_size_of(indices) <= ShardedKey::MAX_SHARD_BYTES {
return indices.len();
}
let mut lo = 1usize;
let mut hi = indices.len();
let mut best = 1usize;
while lo <= hi {
let mid = (lo + hi) / 2;
if encoded_size_of(&indices[..mid]) <= ShardedKey::MAX_SHARD_BYTES {
best = mid;
lo = mid + 1;
} else {
if mid == 1 {
// Even a single index doesn't fit — shouldn't happen but
// bail out to avoid an infinite loop.
break;
}
hi = mid - 1;
}
}
best
}

fn encoded_size_of(indices: &[u64]) -> usize {
BlockNumberList::new_pre_sorted(indices.iter().copied()).serialized_size()
}
17 changes: 16 additions & 1 deletion crates/types/src/sharded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,23 @@ pub struct ShardedKey<T> {
}

impl ShardedKey<()> {
/// Number of indices in one shard.
/// Soft cap on the number of indices in one shard.
///
/// This is a sanity ceiling used alongside [`Self::MAX_SHARD_BYTES`];
/// shard splitting is driven by encoded size, not by this count.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another thing we should consider is whether we can empirically find size bounds with best and worst case inputs

pub const SHARD_COUNT: usize = 2000;

/// Maximum encoded byte size of a single shard's [`BlockNumberList`].
///
/// [`BlockNumberList`]: crate::BlockNumberList
///
/// The MDBX `DUPSORT` value limit is ~1980 bytes on 4 KB pages
/// (Linux production). Each stored dup value is `key2 || encoded list`,
/// so this budget reserves headroom for `ShardedKey<U256>` (40 bytes),
/// the 2-byte length prefix on `BlockNumberList`, and per-node overhead
/// inside MDBX. Exceeding this triggers `MDBX_BAD_VALSIZE` at write time
/// (ENG-2287).
pub const MAX_SHARD_BYTES: usize = 1500;
}

impl<T> ShardedKey<T> {
Expand Down