diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 0f7c43543046..c3b809083ecd 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -61,6 +61,7 @@ impl MapArray { /// * `entries.null_count() != 0` /// * `entries.columns().len() != 2` /// * `field.data_type() != entries.data_type()` + /// * the keys field is nullable pub fn try_new( field: FieldRef, offsets: OffsetBuffer, @@ -109,6 +110,16 @@ impl MapArray { ))); } + // The Arrow spec requires the "key" field to be non-nullable + // + if entries.fields()[0].is_nullable() { + return Err(ArrowError::InvalidArgumentError( + "MapArray keys field cannot be nullable".to_string(), + )); + } + // No need to verify if key contain nulls since `StructArray` + // already disallow nulls for non nullable fields + Ok(Self { data_type: DataType::Map(field, ordered), nulls, @@ -931,6 +942,27 @@ mod tests { ); } + #[test] + fn test_try_new_nullable_keys_field() { + // https://github.com/apache/arrow-rs/issues/10268 + let keys = Int32Array::from(vec![Some(1), None]); + let values = Int32Array::from(vec![None, Some(2)]); + let fields = Fields::from(vec![ + Field::new("keys", DataType::Int32, true), + Field::new("values", DataType::Int32, true), + ]); + let entries = + StructArray::new(fields.clone(), vec![Arc::new(keys), Arc::new(values)], None); + let field = Arc::new(Field::new("entries", DataType::Struct(fields), false)); + + let err = MapArray::try_new(field, OffsetBuffer::from_lengths([2]), entries, None, false) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: MapArray keys field cannot be nullable" + ); + } + #[test] fn test_from_vec_of_maps() { for ordered in [true, false] { diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 19e3037846ab..905a57dc4f1f 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -991,6 +991,11 @@ impl ArrayData { ))); } } + DataType::Map(f, _) if f.is_nullable() => { + return Err(ArrowError::InvalidArgumentError( + "The nullable should be set to false for the map entries field.".to_string(), + )); + } _ => {} }; @@ -1140,7 +1145,7 @@ impl ArrayData { /// Validates the layout of `child_data` ArrayData structures fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { - DataType::List(field) | DataType::Map(field, _) => { + DataType::List(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; self.validate_offsets::(values_data.len)?; Ok(()) @@ -1150,6 +1155,30 @@ impl ArrayData { self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::Map(field, _) => { + let DataType::Struct(entries_fields) = field.data_type() else { + return Err(ArrowError::InvalidArgumentError(format!( + "Map field should be a entries struct data type, got {:?} instead", + field.data_type() + ))); + }; + if entries_fields.len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "Map entries data type should be a struct containing 2 fields, got {} fields", + entries_fields.len() + ))); + } + + // Key field + if entries_fields[0].is_nullable() { + return Err(ArrowError::InvalidArgumentError( + "Map key field must not be nullable".to_string(), + )); + } + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(values_data.len)?; + Ok(()) + } DataType::ListView(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; self.validate_offsets_and_sizes::(values_data.len)?; @@ -2303,6 +2332,7 @@ pub(crate) fn get_fixed_size_binary_width(data_type: &DataType) -> usize { #[cfg(test)] mod tests { use super::*; + use arrow_buffer::{OffsetBuffer, ScalarBuffer}; use arrow_schema::{Field, Fields}; // See arrow/tests/array_data_validation.rs for test of array validation @@ -2766,4 +2796,329 @@ mod tests { format!("Invalid argument error: Last offset 2 of Utf8 is larger than values length 0",) ); } + + #[test] + fn should_fail_validation_when_having_map_field_type_is_not_struct() { + let map_field = Field::new("key", DataType::Int32, false); + + let map_field_data = valid_non_nullable_int32_array_data(2); + + let results = test_both_builder_and_array_data( + DataType::Map(map_field.into(), false), + 1, + None, + 0, + vec![ + OffsetBuffer::::from_lengths(vec![2]) + .into_inner() + .into(), + ], + vec![map_field_data], + ); + + for result in results { + let array_data_err = result.expect_err("should fail for non struct field"); + + match array_data_err { + ArrowError::InvalidArgumentError(msg) => { + assert_eq!( + msg, + "Map field should be a entries struct data type, got Int32 instead" + ) + } + _ => panic!("unexpected error type {array_data_err}"), + }; + } + } + + #[test] + fn should_fail_validation_when_having_map_entries_only_have_1_field() { + let struct_data_type = DataType::Struct(Fields::from(vec![Field::new( + "key", + DataType::Int32, + false, + )])); + + let key_array_data = valid_non_nullable_int32_array_data(2); + + let struct_data = { + let builder = ArrayDataBuilder::new(struct_data_type.clone()) + .len(2) + .nulls(None) + .child_data(vec![key_array_data]); + + builder.build().unwrap() + }; + + let results = test_both_builder_and_array_data( + DataType::Map(Field::new("entries", struct_data_type, false).into(), false), + 1, + None, + 0, + vec![ + OffsetBuffer::::from_lengths(vec![2]) + .into_inner() + .into(), + ], + vec![struct_data], + ); + + for result in results { + let array_data_err = result.expect_err("should fail for nullable key"); + + match array_data_err { + ArrowError::InvalidArgumentError(msg) => { + assert_eq!( + msg, + "Map entries data type should be a struct containing 2 fields, got 1 fields" + ) + } + _ => panic!("unexpected error type {array_data_err}"), + }; + } + } + + #[test] + fn should_fail_validation_when_having_map_entries_have_3_fields() { + let struct_data_type = DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int32, false), + Field::new("values", DataType::Utf8, true), + Field::new("other", DataType::Int32, true), + ])); + + let key_array_data = valid_non_nullable_int32_array_data(2); + + let values_array_data = valid_string_array_data(2); + + let other_array_data = key_array_data.clone(); + + let struct_data = { + let builder = ArrayDataBuilder::new(struct_data_type.clone()) + .len(2) + .nulls(None) + .child_data(vec![key_array_data, values_array_data, other_array_data]); + + builder.build().unwrap() + }; + + let results = test_both_builder_and_array_data( + DataType::Map(Field::new("entries", struct_data_type, false).into(), false), + 1, + None, + 0, + vec![ + OffsetBuffer::::from_lengths(vec![2]) + .into_inner() + .into(), + ], + vec![struct_data], + ); + + for result in results { + let array_data_err = result.expect_err("should fail for nullable key"); + + match array_data_err { + ArrowError::InvalidArgumentError(msg) => { + assert_eq!( + msg, + "Map entries data type should be a struct containing 2 fields, got 3 fields" + ) + } + _ => panic!("unexpected error type {array_data_err}"), + }; + } + } + + #[test] + fn should_fail_validation_when_having_nullable_map_keys() { + let struct_data_type = DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int32, true), + Field::new("values", DataType::Utf8, true), + ])); + + let key_array_data = valid_non_nullable_int32_array_data(2); + let values_array_data = valid_string_array_data(2); + + let struct_data = { + let builder = ArrayDataBuilder::new(struct_data_type.clone()) + .len(2) + .nulls(None) + .child_data(vec![key_array_data, values_array_data]); + + builder.build().unwrap() + }; + + let results = test_both_builder_and_array_data( + DataType::Map(Field::new("entries", struct_data_type, false).into(), false), + 1, + None, + 0, + vec![ + OffsetBuffer::::from_lengths(vec![2]) + .into_inner() + .into(), + ], + vec![struct_data], + ); + + for result in results { + let array_data_err = result.expect_err("should fail for nullable key"); + + match array_data_err { + ArrowError::InvalidArgumentError(msg) => { + assert_eq!(msg, "Map key field must not be nullable") + } + _ => panic!("unexpected error type {array_data_err}"), + }; + } + } + + #[test] + fn should_fail_validation_when_having_entries_is_nullable_for_map() { + let struct_data_type = DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int32, false), + Field::new("values", DataType::Utf8, true), + ])); + + let key_array_data = valid_non_nullable_int32_array_data(2); + + let values_array_data = valid_string_array_data(2); + + let struct_data = { + let builder = ArrayDataBuilder::new(struct_data_type.clone()) + .len(2) + .nulls(None) + .child_data(vec![key_array_data, values_array_data]); + + builder.build().unwrap() + }; + + let results = test_both_builder_and_array_data( + DataType::Map(Field::new("entries", struct_data_type, true).into(), false), + 1, + None, + 0, + vec![ + OffsetBuffer::::from_lengths(vec![2]) + .into_inner() + .into(), + ], + vec![struct_data], + ); + + for result in results { + let array_data_err = result.expect_err("should fail for nullable entries"); + + match array_data_err { + ArrowError::InvalidArgumentError(msg) => assert_eq!( + msg, + "The nullable should be set to false for the map entries field." + ), + _ => panic!("unexpected error type {array_data_err}"), + }; + } + } + + #[test] + fn should_allow_to_create_map_from_data() { + let struct_data_type = DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int32, false), + Field::new("values", DataType::Utf8, true), + ])); + + let key_array_data = valid_non_nullable_int32_array_data(2); + let values_array_data = valid_string_array_data(2); + + let struct_data = { + let builder = ArrayDataBuilder::new(struct_data_type.clone()) + .len(2) + .nulls(None) + .child_data(vec![key_array_data, values_array_data]); + + builder.build().unwrap() + }; + + let results = test_both_builder_and_array_data( + DataType::Map(Field::new("entries", struct_data_type, false).into(), false), + 1, + None, + 0, + vec![ + OffsetBuffer::::from_lengths(vec![2]) + .into_inner() + .into(), + ], + vec![struct_data], + ); + + for result in results { + result.expect("should be able to create map ArrayData"); + } + } + + fn valid_string_array_data(length: usize) -> ArrayData { + let offsets = OffsetBuffer::::from_lengths(vec![0; length]) + .into_inner() + .into_inner(); + let empty_bytes = Buffer::default(); + + let builder = ArrayDataBuilder::new(DataType::Utf8) + .len(length) + .buffers(vec![offsets, empty_bytes]) + .nulls(None); + + builder.build().unwrap() + } + + fn valid_non_nullable_int32_array_data(length: usize) -> ArrayData { + let builder = ArrayDataBuilder::new(DataType::Int32) + .len(length) + .nulls(None) + .buffers(vec![ + ScalarBuffer::::from(vec![1; length]).into_inner(), + ]); + + builder.build().unwrap() + } + + #[test] + fn empty_and_null_map_array_should_pass_validation() { + let dt = DataType::Map( + Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Int32, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + ) + .into(), + false, + ); + + ArrayData::new_empty(&dt).validate_full().unwrap(); + ArrayData::new_null(&dt, 1).validate_full().unwrap(); + } + + fn test_both_builder_and_array_data( + data_type: DataType, + len: usize, + null_bit_buffer: Option, + offset: usize, + buffers: Vec, + child_data: Vec, + ) -> [Result; 2] { + let from_builder_res = ArrayData::builder(data_type.clone()) + .len(len) + .add_buffers(buffers.clone()) + .null_bit_buffer(null_bit_buffer.clone()) + .offset(offset) + .child_data(child_data.clone()) + .build(); + + let from_try_new_res = + ArrayData::try_new(data_type, len, null_bit_buffer, offset, buffers, child_data); + + [from_builder_res, from_try_new_res] + } } diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 6cb1d7e6b021..5ee7da20153c 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -2742,14 +2742,14 @@ mod tests { let value_dict_keys = Int8Array::from_iter_values([0, 1, 1, 2, 3, 1]); let value_dict_array = DictionaryArray::new(value_dict_keys, values.clone()); - let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); + let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 2, 2, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, values); #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), - true, // It is technically not legal for this field to be null. + false, 1, false, )); @@ -2954,13 +2954,13 @@ mod tests { let bin_view_array = Arc::new(BinaryViewArray::from_iter(bin_values)); let utf8_view_array = Arc::new(StringViewArray::from_iter(utf8_values)); - let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); + let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 2, 0, 2, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone()); #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)), - true, + false, 1, false, )); diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 8ed2debb6a4b..257707469dc5 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -6391,34 +6391,6 @@ mod tests { assert_eq!(&map, &back[0]); } - // Test Map - both keys and values are Null type - #[test] - fn test_map_null_keys_and_null_values() { - let null_keys = Arc::new(NullArray::new(3)) as ArrayRef; - let null_values = Arc::new(NullArray::new(3)) as ArrayRef; - - let offsets = OffsetBuffer::new(vec![0, 1, 1, 3].into()); - let entries_fields = vec![ - Arc::new(Field::new("keys", DataType::Null, true)), - Arc::new(Field::new("values", DataType::Null, true)), - ]; - let struct_field = Arc::new(Field::new( - "entries", - DataType::Struct(entries_fields.clone().into()), - false, - )); - let entries = StructArray::new(entries_fields.into(), vec![null_keys, null_values], None); - - let map: ArrayRef = Arc::new(MapArray::new(struct_field, offsets, entries, None, false)); - - let converter = RowConverter::new(vec![SortField::new(map.data_type().clone())]).unwrap(); - let rows = converter.convert_columns(&[Arc::clone(&map)]).unwrap(); - let back = converter.convert_rows(&rows).unwrap(); - assert_eq!(back.len(), 1); - back[0].to_data().validate_full().unwrap(); - assert_eq!(&map, &back[0]); - } - // Test Map all empty maps #[test] fn test_map_null_all_empty() {