From b8be5a6cc3ae7095b0a06c4623e017c9097251bc Mon Sep 17 00:00:00 2001 From: Han Damin Date: Mon, 29 Jun 2026 13:56:18 +0900 Subject: [PATCH 1/4] fix(arrow-avro): bound block-size reservation to available input A crafted OCF block size of i64::MAX hit Vec::reserve before any payload was read, aborting the process. Reserve only what the input backs. Closes #10234 --- arrow-avro/src/reader/block.rs | 77 +++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/arrow-avro/src/reader/block.rs b/arrow-avro/src/reader/block.rs index e20818bf968a..6adaf8bd723d 100644 --- a/arrow-avro/src/reader/block.rs +++ b/arrow-avro/src/reader/block.rs @@ -96,7 +96,13 @@ impl BlockDecoder { AvroError::ParseError(format!("Block size cannot be negative, got {c}")) })?; - self.in_progress.data.reserve(self.bytes_remaining); + // Only reserve what the current input backs: the block size is + // attacker-controlled here, so reserving it outright lets a crafted + // `i64::MAX` size abort the process (#10234). The rest is reserved + // lazily by `extend_from_slice` below as data arrives. + self.in_progress + .data + .reserve(self.bytes_remaining.min(buf.len())); self.state = BlockDecoderState::Data; } } @@ -148,3 +154,72 @@ impl BlockDecoder { self.bytes_remaining } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Zig-zag encode `value` as an Avro `long` (variable-length integer). + fn encode_long(value: i64, out: &mut Vec) { + let mut n = ((value << 1) ^ (value >> 63)) as u64; + while n >= 0x80 { + out.push((n as u8) | 0x80); + n >>= 7; + } + out.push(n as u8); + } + + #[test] + fn test_oversized_block_size_bounds_reserve() { + // A block advertising `i64::MAX` bytes must not reserve that up front when only + // a few payload bytes are present, or a crafted OCF aborts on a huge alloc (#10234). + let mut buf = Vec::new(); + encode_long(1, &mut buf); // object count + encode_long(i64::MAX, &mut buf); // attacker-controlled block size + buf.extend_from_slice(&[0u8; 8]); // a handful of real bytes + + let mut decoder = BlockDecoder::default(); + let read = decoder.decode(&buf).unwrap(); + + assert_eq!(read, buf.len(), "all available input should be consumed"); + assert!( + decoder.in_progress.data.capacity() <= buf.len(), + "capacity {} must stay bounded by available input {}, not the advertised i64::MAX", + decoder.in_progress.data.capacity(), + buf.len(), + ); + } + + #[test] + fn test_negative_block_size_errors() { + let mut buf = Vec::new(); + encode_long(1, &mut buf); // object count + encode_long(-1, &mut buf); // invalid (negative) block size + + let mut decoder = BlockDecoder::default(); + let err = decoder.decode(&buf).unwrap_err(); + assert!( + err.to_string().contains("Block size cannot be negative"), + "unexpected error: {err}", + ); + } + + #[test] + fn test_well_formed_block_round_trips() { + // The capped reserve must not change decoding of a normal block. + let payload = [1u8, 2, 3, 4]; + let sync = [7u8; 16]; + let mut buf = Vec::new(); + encode_long(2, &mut buf); // object count + encode_long(payload.len() as i64, &mut buf); // block size + buf.extend_from_slice(&payload); + buf.extend_from_slice(&sync); + + let mut decoder = BlockDecoder::default(); + assert_eq!(decoder.decode(&buf).unwrap(), buf.len()); + let block = decoder.flush().expect("a complete block"); + assert_eq!(block.count, 2); + assert_eq!(block.data, payload); + assert_eq!(block.sync, sync); + } +} From 6672301856facd8bf688c0657127f9339038b053 Mon Sep 17 00:00:00 2001 From: Han Damin Date: Mon, 29 Jun 2026 13:56:18 +0900 Subject: [PATCH 2/4] fix(arrow-avro): bound array/map block item counts A block count of i64::MAX spun the item loop forever for zero-byte items like null, and i64::MIN overflowed the negation. Reject counts above the bytes remaining and negate with unsigned_abs. Closes #10235 --- arrow-avro/src/reader/cursor.rs | 6 +++ arrow-avro/src/reader/record.rs | 88 +++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 10 deletions(-) diff --git a/arrow-avro/src/reader/cursor.rs b/arrow-avro/src/reader/cursor.rs index b1199dad2080..d05c03dfa6c3 100644 --- a/arrow-avro/src/reader/cursor.rs +++ b/arrow-avro/src/reader/cursor.rs @@ -41,6 +41,12 @@ impl<'a> AvroCursor<'a> { self.start_len - self.buf.len() } + /// Returns the number of bytes left to read + #[inline] + pub(crate) fn remaining(&self) -> usize { + self.buf.len() + } + /// Read a single `u8` #[inline] pub(crate) fn get_u8(&mut self) -> Result { diff --git a/arrow-avro/src/reader/record.rs b/arrow-avro/src/reader/record.rs index 306c77718234..39e51ab99dcb 100644 --- a/arrow-avro/src/reader/record.rs +++ b/arrow-avro/src/reader/record.rs @@ -2286,35 +2286,57 @@ fn process_blockwise( match block_count.cmp(&0) { Ordering::Equal => break, Ordering::Less => { - let count = (-block_count) as usize; + // `unsigned_abs` avoids overflowing `-block_count` for `i64::MIN` (#10235) + let count = block_count.unsigned_abs() as usize; // A negative count is followed by a long of the size in bytes - let size_in_bytes = buf.get_long()? as usize; + let raw_size = buf.get_long()?; + let size_in_bytes = usize::try_from(raw_size).map_err(|_| { + AvroError::ParseError(format!("Block size cannot be negative, got {raw_size}")) + })?; match negative_behavior { NegativeBlockBehavior::ProcessItems => { // Process items one-by-one after reading size - for _ in 0..count { - on_item(buf)?; - } + process_block_items(buf, count, &mut on_item)?; } NegativeBlockBehavior::SkipBySize => { // Skip the entire block payload at once let _ = buf.get_fixed(size_in_bytes)?; } } - total += count; + total = total.saturating_add(count); } Ordering::Greater => { let count = block_count as usize; - for _ in 0..count { - on_item(buf)?; - } - total += count; + process_block_items(buf, count, &mut on_item)?; + total = total.saturating_add(count); } } } Ok(total) } +/// Decode `count` block items, rejecting a `count` larger than the bytes left to +/// decode them from. A crafted block can advertise up to `i64::MAX` items which, +/// with a zero-byte decoder like `null`, would otherwise spin forever (#10235). +/// Items that read input each need at least one byte, so a real `count` always fits. +#[inline] +fn process_block_items( + buf: &mut AvroCursor, + count: usize, + on_item: &mut impl FnMut(&mut AvroCursor) -> Result<(), AvroError>, +) -> Result<(), AvroError> { + let remaining = buf.remaining(); + if count > remaining { + return Err(AvroError::ParseError(format!( + "Block declares {count} items but only {remaining} bytes remain" + ))); + } + for _ in 0..count { + on_item(buf)?; + } + Ok(()) +} + #[inline] fn flush_values(values: &mut Vec) -> Vec { std::mem::replace(values, Vec::with_capacity(DEFAULT_CAPACITY)) @@ -3434,6 +3456,52 @@ mod tests { assert_eq!(values.value(2), 3); } + /// Zig-zag + unsigned-LEB128 encode, correct for all `i64` including `MIN`/`MAX` + /// (`encode_avro_long` loops forever on those two values). + fn encode_avro_long_extreme(value: i64) -> Vec { + let mut n = ((value << 1) ^ (value >> 63)) as u64; + let mut out = Vec::new(); + while n >= 0x80 { + out.push((n as u8) | 0x80); + n >>= 7; + } + out.push(n as u8); + out + } + + // `array` is the worst case: items consume no bytes, so without a bound the + // loop never advances the cursor and spins for the whole `block_count` (#10235). + fn array_of_null_decoder() -> Decoder { + let list_dt = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Null)))); + Decoder::try_new(&list_dt).unwrap() + } + + #[test] + fn test_array_block_count_i64_max_errors() { + // A positive `i64::MAX` block count must error rather than spin the item loop. + let mut decoder = array_of_null_decoder(); + let mut data = encode_avro_long_extreme(i64::MAX); // item count + data.extend_from_slice(&encode_avro_long(0)); // empty-block terminator + let err = decoder.decode(&mut AvroCursor::new(&data)).unwrap_err(); + assert!( + err.to_string().contains("bytes remain"), + "unexpected error: {err}", + ); + } + + #[test] + fn test_array_block_count_i64_min_errors() { + // `i64::MIN` previously overflowed `-block_count` before spinning the loop. + let mut decoder = array_of_null_decoder(); + let mut data = encode_avro_long_extreme(i64::MIN); // negative item count + data.extend_from_slice(&encode_avro_long(0)); // block size in bytes + let err = decoder.decode(&mut AvroCursor::new(&data)).unwrap_err(); + assert!( + err.to_string().contains("bytes remain"), + "unexpected error: {err}", + ); + } + #[test] fn test_nested_array_decoding() { let inner_ty = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Int32)))); From d2bb616973229dc76cd964b27efc86ee57c53d69 Mon Sep 17 00:00:00 2001 From: Han Damin Date: Wed, 1 Jul 2026 22:17:52 +0900 Subject: [PATCH 3/4] docs(arrow-avro): soften wording in block reserve comment Co-authored-by: Jeffrey Vo --- arrow-avro/src/reader/block.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-avro/src/reader/block.rs b/arrow-avro/src/reader/block.rs index 6adaf8bd723d..8df76c4b15f0 100644 --- a/arrow-avro/src/reader/block.rs +++ b/arrow-avro/src/reader/block.rs @@ -97,8 +97,8 @@ impl BlockDecoder { })?; // Only reserve what the current input backs: the block size is - // attacker-controlled here, so reserving it outright lets a crafted - // `i64::MAX` size abort the process (#10234). The rest is reserved + // input specified so could be an extreme value (e.g. i64::MAX) + // in case of corrupted/malicious input. The rest is reserved // lazily by `extend_from_slice` below as data arrives. self.in_progress .data From 83f39099be3593b502cf636ff4cbd3c4d58d793f Mon Sep 17 00:00:00 2001 From: Han Damin Date: Wed, 1 Jul 2026 22:47:31 +0900 Subject: [PATCH 4/4] fix(arrow-avro): cap array/map item count at i32::MAX The old check rejected any block with more items than bytes remaining, wrongly refusing valid `array` (zero-byte items). Cap the item total at `i32::MAX` instead; oversized counts still error before the loop. --- arrow-avro/src/reader/cursor.rs | 6 ----- arrow-avro/src/reader/record.rs | 47 ++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/arrow-avro/src/reader/cursor.rs b/arrow-avro/src/reader/cursor.rs index d05c03dfa6c3..b1199dad2080 100644 --- a/arrow-avro/src/reader/cursor.rs +++ b/arrow-avro/src/reader/cursor.rs @@ -41,12 +41,6 @@ impl<'a> AvroCursor<'a> { self.start_len - self.buf.len() } - /// Returns the number of bytes left to read - #[inline] - pub(crate) fn remaining(&self) -> usize { - self.buf.len() - } - /// Read a single `u8` #[inline] pub(crate) fn get_u8(&mut self) -> Result { diff --git a/arrow-avro/src/reader/record.rs b/arrow-avro/src/reader/record.rs index 39e51ab99dcb..2a29ba127e4d 100644 --- a/arrow-avro/src/reader/record.rs +++ b/arrow-avro/src/reader/record.rs @@ -2296,45 +2296,46 @@ fn process_blockwise( match negative_behavior { NegativeBlockBehavior::ProcessItems => { // Process items one-by-one after reading size - process_block_items(buf, count, &mut on_item)?; + total = process_block_items(buf, count, total, &mut on_item)?; } NegativeBlockBehavior::SkipBySize => { // Skip the entire block payload at once let _ = buf.get_fixed(size_in_bytes)?; + total = total.saturating_add(count); } } - total = total.saturating_add(count); } Ordering::Greater => { let count = block_count as usize; - process_block_items(buf, count, &mut on_item)?; - total = total.saturating_add(count); + total = process_block_items(buf, count, total, &mut on_item)?; } } } Ok(total) } -/// Decode `count` block items, rejecting a `count` larger than the bytes left to -/// decode them from. A crafted block can advertise up to `i64::MAX` items which, -/// with a zero-byte decoder like `null`, would otherwise spin forever (#10235). -/// Items that read input each need at least one byte, so a real `count` always fits. +/// Decode `count` items, capping the running total at `i32::MAX` (the largest index +/// an Arrow list/map offset holds). Otherwise a crafted `i64::MAX` count of a zero-byte +/// item like `null` spins the loop forever (#10235); byte-consuming items self-terminate +/// on cursor exhaustion, so valid blocks (including `array`) are unaffected. #[inline] fn process_block_items( buf: &mut AvroCursor, count: usize, + total: usize, on_item: &mut impl FnMut(&mut AvroCursor) -> Result<(), AvroError>, -) -> Result<(), AvroError> { - let remaining = buf.remaining(); - if count > remaining { +) -> Result { + let new_total = total.checked_add(count).filter(|&t| t <= i32::MAX as usize); + let Some(new_total) = new_total else { return Err(AvroError::ParseError(format!( - "Block declares {count} items but only {remaining} bytes remain" + "Array/map item count {count} exceeds the maximum {} addressable by 32-bit offsets", + i32::MAX ))); - } + }; for _ in 0..count { on_item(buf)?; } - Ok(()) + Ok(new_total) } #[inline] @@ -3469,13 +3470,23 @@ mod tests { out } - // `array` is the worst case: items consume no bytes, so without a bound the - // loop never advances the cursor and spins for the whole `block_count` (#10235). + // `array` is the worst case: items consume no bytes, so an unbounded + // `block_count` spins the item loop without ever advancing the cursor (#10235). fn array_of_null_decoder() -> Decoder { let list_dt = avro_from_codec(Codec::List(Arc::new(avro_from_codec(Codec::Null)))); Decoder::try_new(&list_dt).unwrap() } + #[test] + fn test_array_of_null_decodes() { + // A valid `array` still decodes: zero-byte items make the count exceed the + // bytes left, so the bound must not reject it (#10235 review). + let mut decoder = array_of_null_decoder(); + let mut data = encode_avro_long(3); // three null items + data.extend_from_slice(&encode_avro_long(0)); // empty-block terminator + decoder.decode(&mut AvroCursor::new(&data)).unwrap(); + } + #[test] fn test_array_block_count_i64_max_errors() { // A positive `i64::MAX` block count must error rather than spin the item loop. @@ -3484,7 +3495,7 @@ mod tests { data.extend_from_slice(&encode_avro_long(0)); // empty-block terminator let err = decoder.decode(&mut AvroCursor::new(&data)).unwrap_err(); assert!( - err.to_string().contains("bytes remain"), + err.to_string().contains("exceeds the maximum"), "unexpected error: {err}", ); } @@ -3497,7 +3508,7 @@ mod tests { data.extend_from_slice(&encode_avro_long(0)); // block size in bytes let err = decoder.decode(&mut AvroCursor::new(&data)).unwrap_err(); assert!( - err.to_string().contains("bytes remain"), + err.to_string().contains("exceeds the maximum"), "unexpected error: {err}", ); }