From 4570a73839f702fee932170720e1aeff4deae68a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 4 Jul 2026 22:52:12 +0300 Subject: [PATCH 1/3] fix: don't panic on `ArrayData::try_new` on bad input even when force_validate feature is on --- arrow-data/src/data.rs | 82 ++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 0ce98aa090df..4a71227fd40c 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -294,24 +294,18 @@ impl ArrayData { buffers: Vec, child_data: Vec, ) -> Self { - let mut skip_validation = UnsafeFlag::new(); - // SAFETY: caller responsible for ensuring data is valid - unsafe { skip_validation.set(true) }; - - ArrayDataBuilder { + let builder = Self::inner_new_builder( data_type, len, null_count, null_bit_buffer, - nulls: None, offset, buffers, child_data, - align_buffers: false, - skip_validation, - } - .build() - .unwrap() + ); + + // SAFETY: caller responsible for ensuring data is valid + unsafe { builder.build_unchecked() } } /// Create a new ArrayData, validating that the provided buffers form a valid @@ -337,7 +331,7 @@ impl ArrayData { ) -> Result { // we must check the length of `null_bit_buffer` first // because we use this buffer to calculate `null_count` - // in `Self::new_unchecked`. + // in `ArrayDataBuilder::build`. if let Some(null_bit_buffer) = null_bit_buffer.as_ref() { let len_plus_offset = checked_len_plus_offset(&data_type, len, offset)?; let needed_len = bit_util::ceil(len_plus_offset, 8); @@ -349,25 +343,47 @@ impl ArrayData { ))); } } - // Safety justification: `validate_full` is called below - let new_self = unsafe { - Self::new_unchecked( - data_type, - len, - None, - null_bit_buffer, - offset, - buffers, - child_data, - ) - }; + + let builder = Self::inner_new_builder( + data_type, + len, + None, + null_bit_buffer, + offset, + buffers, + child_data, + ); + + assert!(!builder.skip_validation.get()); // As the data is not trusted, do a full validation of its contents // We don't need to validate children as we can assume that the // [`ArrayData`] in `child_data` have already been validated through // a call to `ArrayData::try_new` or created using unsafe - new_self.validate_data()?; - Ok(new_self) + builder.build() + } + + fn inner_new_builder( + data_type: DataType, + len: usize, + null_count: Option, + null_bit_buffer: Option, + offset: usize, + buffers: Vec, + child_data: Vec, + ) -> ArrayDataBuilder { + ArrayDataBuilder { + data_type, + len, + null_count, + null_bit_buffer, + nulls: None, + offset, + buffers, + child_data, + align_buffers: false, + skip_validation: UnsafeFlag::new(), + } } /// Return the constituent parts of this ArrayData @@ -2727,4 +2743,18 @@ mod tests { assert!(array.is_null(i)); } } + + // Even when `force_validate` feature is on + #[test] + fn test_dont_panic_on_bad_input_when_using_try_new() { + let + + assert_eq!( + err.to_string(), + format!( + "Invalid argument error: Length {} with offset 1 overflows usize for Binary", + usize::MAX + ) + ); + } } From c79a3eeafd5fd9ae880c0c9a0fec5d6d08e9854b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 4 Jul 2026 22:58:57 +0300 Subject: [PATCH 2/3] add test --- arrow-data/src/data.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 4a71227fd40c..becbfddc46dd 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -2747,13 +2747,22 @@ mod tests { // Even when `force_validate` feature is on #[test] fn test_dont_panic_on_bad_input_when_using_try_new() { - let + let empty_bytes = Buffer::default(); + + let builder = ArrayDataBuilder::new(DataType::Utf8) + .len(1) + + // the offsets says that we have 2 bytes but the buffer is empty + .buffers(vec![Buffer::from_vec(vec![0i32, 2i32]), empty_bytes]) + .nulls(None); + + let res = builder.build().expect_err("should get error"); + assert_eq!( - err.to_string(), + res.to_string(), format!( - "Invalid argument error: Length {} with offset 1 overflows usize for Binary", - usize::MAX + "Invalid argument error: Last offset 2 of Utf8 is larger than values length 0", ) ); } From 7549cc43448f82408b4f17338eee707bb2483f80 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 4 Jul 2026 23:03:18 +0300 Subject: [PATCH 3/3] fix test --- arrow-data/src/data.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index becbfddc46dd..19e3037846ab 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -2749,21 +2749,21 @@ mod tests { fn test_dont_panic_on_bad_input_when_using_try_new() { let empty_bytes = Buffer::default(); - let builder = ArrayDataBuilder::new(DataType::Utf8) - .len(1) - + let array_data = ArrayData::try_new( + DataType::Utf8, + 1, // len + None, + 0, // the offsets says that we have 2 bytes but the buffer is empty - .buffers(vec![Buffer::from_vec(vec![0i32, 2i32]), empty_bytes]) - .nulls(None); - - let res = builder.build().expect_err("should get error"); - - - assert_eq!( - res.to_string(), - format!( - "Invalid argument error: Last offset 2 of Utf8 is larger than values length 0", - ) - ); + vec![Buffer::from_vec(vec![0i32, 2i32]), empty_bytes], + vec![], + ); + + let res = array_data.expect_err("should get error"); + + assert_eq!( + res.to_string(), + format!("Invalid argument error: Last offset 2 of Utf8 is larger than values length 0",) + ); } }