From c512dff2b128765aef6336ef525f12040a059931 Mon Sep 17 00:00:00 2001
From: dylan <dylan@init4.technology>
Date: Wed, 20 May 2026 00:07:33 -0600
Subject: [PATCH] fix(hot): split history shards by encoded size, not count

Closes ENG-2287

`AccountsHistory` and `StorageHistory` are stored DUPSORT, so each dup
value (key2 || encoded BlockNumberList) is capped at MDBX's DUPSORT
value limit (~1980 B on 4 KB pages). The previous splitter gated on
ShardedKey::SHARD_COUNT (2000), but a roaring BlockNumberList of 2000
sparse indices serialises to >20 KB. Once a hot account's shard tipped
over, every live block touching it crashed the sidecar with
MDBX_BAD_VALSIZE.

Replace count-based splitting in append_to_sharded_history with a
size-based check against a new ShardedKey::MAX_SHARD_BYTES (1500). When
the merged list overflows the budget, binary-search the largest prefix
that fits and emit shards until the remainder is consumed. Existing
oversized shards self-heal on the next write to them, so no migration
is required.

Add test_history_shard_fits_in_dupsort_limit to the hot conformance
suite. It exercises the real update_history_indices_inconsistent path
with 2000 sparse blocks (one per roaring container, the worst-case
encoding) and asserts the union across shards round-trips.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/hot/src/conformance/history.rs | 53 ++++++++++++++++-
 crates/hot/src/conformance/mod.rs     |  1 +
 crates/hot/src/db/inconsistent.rs     | 83 ++++++++++++++++++++-------
 crates/types/src/sharded.rs           | 17 +++++-
 4 files changed, 132 insertions(+), 22 deletions(-)
diff --git a/crates/hot/src/conformance/history.rs b/crates/hot/src/conformance/history.rs
index af0250c..3f7a67a 100644
--- a/crates/hot/src/conformance/history.rs
+++ b/crates/hot/src/conformance/history.rs
@@ -2,7 +2,7 @@
 
 use crate::{
     db::{HistoryRead, UnsafeDbWrite, UnsafeHistoryWrite},
-    model::{HotKv, HotKvWrite},
+    model::{HotKv, HotKvRead, HotKvWrite},
     tables,
 };
 use alloy::primitives::{U256, address};
@@ -454,3 +454,54 @@ pub fn test_delete_and_rewrite_dual<T: HotKv>(hot_kv: &T) {
         assert_eq!(hist.unwrap().iter().collect::<Vec<_>>(), vec![100, 200, 300]);
     }
 }
+
+/// Regression: account history must split into MDBX-fittable shards when an
+/// account is touched on many sparse blocks.
+///
+/// Why: AccountsHistory is DUPSORT, so each stored dup value
+/// (`key2 || encoded BlockNumberList`) is capped at MDBX's DUPSORT value
+/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of 2000
+/// indices spread across many 16-bit containers exceeds 20 KB. The history
+/// pipeline must split shards by encoded size, not by index count, so any
+/// pattern of `update_history_indices_inconsistent` survives. ENG-2287.
+pub fn test_history_shard_fits_in_dupsort_limit<T: HotKv>(hot_kv: &T) {
+    let addr = address!("0xcccccccccccccccccccccccccccccccccccccccc");
+
+    // Sparse pattern: 2000 block-numbers spread far apart so each lives in
+    // its own roaring 16-bit container — the worst-case for serialised size
+    // (per-container header dominates). Write a change-set per block, then
+    // ask the history pipeline to index the full range in one call.
+    let blocks: Vec<u64> =
+        (0..ShardedKey::SHARD_COUNT as u64).map(|i| i.saturating_mul(100_000) + 1).collect();
+
+    {
+        let writer = hot_kv.writer().unwrap();
+        let acc = Account::default();
+        for &b in &blocks {
+            writer.write_account_prestate(b, addr, &acc).unwrap();
+        }
+        writer.commit().unwrap();
+    }
+
+    {
+        let writer = hot_kv.writer().unwrap();
+        let first = *blocks.first().unwrap();
+        let last = *blocks.last().unwrap();
+        writer.update_history_indices_inconsistent(first..=last).unwrap();
+        writer.commit().unwrap();
+    }
+
+    // Walk every shard for this address; the union must equal `blocks`.
+    let reader = hot_kv.reader().unwrap();
+    let mut cursor = reader.traverse_dual::<tables::AccountsHistory>().unwrap();
+    let mut seen = Vec::new();
+    let mut entry = cursor.last_of_k1(&addr).unwrap();
+    while let Some((a, _shard_key, list)) = entry {
+        assert_eq!(a, addr);
+        let list: BlockNumberList = list;
+        seen.extend(list.iter());
+        entry = cursor.previous_k2().unwrap();
+    }
+    seen.sort_unstable();
+    assert_eq!(seen, blocks, "all touched blocks must be reachable across shards");
+}
diff --git a/crates/hot/src/conformance/mod.rs b/crates/hot/src/conformance/mod.rs
index 2639865..1b17cb7 100644
--- a/crates/hot/src/conformance/mod.rs
+++ b/crates/hot/src/conformance/mod.rs
@@ -27,6 +27,7 @@ pub fn conformance<T: HotKv>(hot_kv: &T) {
     test_bytecode_roundtrip(hot_kv);
     test_account_history(hot_kv);
     test_storage_history(hot_kv);
+    test_history_shard_fits_in_dupsort_limit(hot_kv);
     test_account_changes(hot_kv);
     test_storage_changes(hot_kv);
     test_missing_reads(hot_kv);
diff --git a/crates/hot/src/db/inconsistent.rs b/crates/hot/src/db/inconsistent.rs
index 9a47a2b..5978e72 100644
--- a/crates/hot/src/db/inconsistent.rs
+++ b/crates/hot/src/db/inconsistent.rs
@@ -521,13 +521,23 @@ impl<T> UnsafeHistoryWrite for T where T: UnsafeDbWrite + HotKvWrite {}
 /// This helper handles the common pattern of:
 /// 1. Appending new block numbers to an existing shard
 /// 2. Deleting the old shard if it exists
-/// 3. Splitting into multiple shards if the result exceeds the shard size
+/// 3. Splitting into multiple shards if the encoded result exceeds the
+///    backend's per-value size budget
 ///
 /// # Arguments
 /// - `existing`: The current last shard (key, list) if any
 /// - `indices`: New block numbers to append
 /// - `delete_old`: Called to delete the old shard key before writing new ones
 /// - `write_shard`: Called for each resulting shard (highest_block, list)
+///
+/// # Splitting strategy
+///
+/// Splitting is driven by the encoded byte size of the [`BlockNumberList`],
+/// not by the index count. This is required because the history tables are
+/// stored DUPSORT in MDBX, which caps each dup value at ~1980 bytes on
+/// 4 KB pages. A count-based threshold cannot guarantee fitting under that
+/// cap because roaring's serialised size depends on the value distribution,
+/// not just the count. See [`ShardedKey::MAX_SHARD_BYTES`] (ENG-2287).
 fn append_to_sharded_history<K, E, D, W>(
     existing: Option<(K, BlockNumberList)>,
     indices: impl IntoIterator<Item = u64>,
@@ -550,29 +560,62 @@ where
         delete_old(key).map_err(HistoryError::Db)?;
     }
 
-    // Fast path: all indices fit in one shard
-    if last_shard.len() <= ShardedKey::SHARD_COUNT as u64 {
+    // Fast path: encoded list fits under the per-shard byte budget.
+    if last_shard.serialized_size() <= ShardedKey::MAX_SHARD_BYTES {
         return write_shard(u64::MAX, &last_shard).map_err(HistoryError::Db);
     }
 
-    // Slow path: rechunk into multiple shards
-    // Reuse a single buffer to avoid allocating a new Vec per chunk
-    let mut chunk_buf = Vec::with_capacity(ShardedKey::SHARD_COUNT);
-    let mut iter = last_shard.iter().peekable();
-
-    while iter.peek().is_some() {
-        chunk_buf.clear();
-        chunk_buf.extend(iter.by_ref().take(ShardedKey::SHARD_COUNT));
-
-        let highest = if iter.peek().is_some() {
-            *chunk_buf.last().expect("chunk_buf is non-empty")
-        } else {
-            // Insert last list with `u64::MAX`
-            u64::MAX
-        };
-
-        let shard = BlockNumberList::new_pre_sorted(chunk_buf.iter().copied());
+    // Slow path: re-chunk into multiple shards whose encoded sizes each
+    // fit under the budget.
+    let all: Vec<u64> = last_shard.iter().collect();
+    let mut start = 0;
+    while start < all.len() {
+        let take = max_prefix_fitting(&all[start..]);
+        // A single value is always far smaller than MAX_SHARD_BYTES, so we
+        // can always make forward progress.
+        debug_assert!(take > 0, "no prefix fits in MAX_SHARD_BYTES");
+        let end = start + take;
+        let highest = if end == all.len() { u64::MAX } else { all[end - 1] };
+        let shard = BlockNumberList::new_pre_sorted(all[start..end].iter().copied());
         write_shard(highest, &shard).map_err(HistoryError::Db)?;
+        start = end;
     }
     Ok(())
 }
+
+/// Binary-search the largest prefix length `n` of `indices` whose encoded
+/// `BlockNumberList` fits in [`ShardedKey::MAX_SHARD_BYTES`].
+///
+/// Returns 0 only if `indices` is empty.
+fn max_prefix_fitting(indices: &[u64]) -> usize {
+    if indices.is_empty() {
+        return 0;
+    }
+    // Optimistic check: the entire slice may fit. Skips the binary search
+    // in the common case where the caller passes a small remainder.
+    if encoded_size_of(indices) <= ShardedKey::MAX_SHARD_BYTES {
+        return indices.len();
+    }
+    let mut lo = 1usize;
+    let mut hi = indices.len();
+    let mut best = 1usize;
+    while lo <= hi {
+        let mid = (lo + hi) / 2;
+        if encoded_size_of(&indices[..mid]) <= ShardedKey::MAX_SHARD_BYTES {
+            best = mid;
+            lo = mid + 1;
+        } else {
+            if mid == 1 {
+                // Even a single index doesn't fit — shouldn't happen but
+                // bail out to avoid an infinite loop.
+                break;
+            }
+            hi = mid - 1;
+        }
+    }
+    best
+}
+
+fn encoded_size_of(indices: &[u64]) -> usize {
+    BlockNumberList::new_pre_sorted(indices.iter().copied()).serialized_size()
+}
diff --git a/crates/types/src/sharded.rs b/crates/types/src/sharded.rs
index bc5ffed..4eba074 100644
--- a/crates/types/src/sharded.rs
+++ b/crates/types/src/sharded.rs
@@ -13,8 +13,23 @@ pub struct ShardedKey<T> {
 }
 
 impl ShardedKey<()> {
-    /// Number of indices in one shard.
+    /// Soft cap on the number of indices in one shard.
+    ///
+    /// This is a sanity ceiling used alongside [`Self::MAX_SHARD_BYTES`];
+    /// shard splitting is driven by encoded size, not by this count.
     pub const SHARD_COUNT: usize = 2000;
+
+    /// Maximum encoded byte size of a single shard's [`BlockNumberList`].
+    ///
+    /// [`BlockNumberList`]: crate::BlockNumberList
+    ///
+    /// The MDBX `DUPSORT` value limit is ~1980 bytes on 4 KB pages
+    /// (Linux production). Each stored dup value is `key2 || encoded list`,
+    /// so this budget reserves headroom for `ShardedKey<U256>` (40 bytes),
+    /// the 2-byte length prefix on `BlockNumberList`, and per-node overhead
+    /// inside MDBX. Exceeding this triggers `MDBX_BAD_VALSIZE` at write time
+    /// (ENG-2287).
+    pub const MAX_SHARD_BYTES: usize = 1500;
 }
 
 impl<T> ShardedKey<T> {