GH-49689: [R][C++] Parquets do not support list-columns of ordered factors (ordered dictionaries)#49937
GH-49689: [R][C++] Parquets do not support list-columns of ordered factors (ordered dictionaries)#49937thisisnic wants to merge 5 commits intoapache:mainfrom
Conversation
|
|
| // GH-49689: Ordered dictionary tests | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) { | ||
| auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder)); | ||
|
|
||
| auto builder_type = boxed_builder->type(); | ||
| ASSERT_TRUE(checked_cast<const DictionaryType&>(*builder_type).ordered()); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, UnorderedTypeStaysUnordered) { | ||
| auto unordered_type = dictionary(int8(), utf8(), /*ordered=*/false); | ||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), unordered_type, &boxed_builder)); | ||
|
|
||
| auto builder_type = boxed_builder->type(); | ||
| ASSERT_FALSE(checked_cast<const DictionaryType&>(*builder_type).ordered()); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) { | ||
| auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder)); | ||
| auto& builder = checked_cast<DictionaryBuilder<StringType>&>(*boxed_builder); | ||
|
|
||
| ASSERT_OK(builder.Append("a")); | ||
| ASSERT_OK(builder.Append("b")); | ||
| ASSERT_OK(builder.Append("a")); | ||
|
|
||
| std::shared_ptr<Array> result; | ||
| ASSERT_OK(builder.Finish(&result)); | ||
|
|
||
| const auto& result_type = checked_cast<const DictionaryType&>(*result->type()); | ||
| ASSERT_TRUE(result_type.ordered()); | ||
|
|
||
| auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])"); | ||
| auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]"); | ||
| DictionaryArray expected(ordered_type, ex_indices, ex_dict); | ||
| AssertArraysEqual(expected, *result); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) { | ||
| auto ordered_dict_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
| auto list_type = list(field("item", ordered_dict_type)); | ||
|
|
||
| std::unique_ptr<ArrayBuilder> boxed_builder; | ||
| ASSERT_OK(MakeBuilder(default_memory_pool(), list_type, &boxed_builder)); | ||
| auto& list_builder = checked_cast<ListBuilder&>(*boxed_builder); | ||
| auto& dict_builder = | ||
| checked_cast<DictionaryBuilder<StringType>&>(*list_builder.value_builder()); | ||
|
|
||
| ASSERT_OK(list_builder.Append()); | ||
| ASSERT_OK(dict_builder.Append("a")); | ||
| ASSERT_OK(dict_builder.Append("b")); | ||
| ASSERT_OK(list_builder.Append()); | ||
| ASSERT_OK(dict_builder.Append("a")); | ||
|
|
||
| std::shared_ptr<Array> result; | ||
| ASSERT_OK(list_builder.Finish(&result)); | ||
|
|
||
| const auto& result_list_type = checked_cast<const ListType&>(*result->type()); | ||
| const auto& result_dict_type = | ||
| checked_cast<const DictionaryType&>(*result_list_type.value_type()); | ||
| ASSERT_TRUE(result_dict_type.ordered()); | ||
| } | ||
|
|
||
| TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) { | ||
| auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); | ||
| std::unique_ptr<ArrayBuilder> builder; | ||
| ASSERT_OK(MakeDictionaryBuilder(default_memory_pool(), ordered_type, | ||
| /*dictionary=*/nullptr, &builder)); | ||
|
|
||
| auto builder_type = builder->type(); | ||
| ASSERT_TRUE(checked_cast<const DictionaryType&>(*builder_type).ordered()); | ||
| } | ||
|
|
There was a problem hiding this comment.
This is all AI-generated. I roughly understood what it does but am not confident in it, other than having checked it looks relatively idiomatic, and all of these tests except UnorderedTypeStaysUnordered failed before the PR and passed after.
| const std::shared_ptr<DataType>& index_type; | ||
| const std::shared_ptr<DataType>& value_type; | ||
| const std::shared_ptr<Array>& dictionary; | ||
| std::shared_ptr<Array> dictionary; |
There was a problem hiding this comment.
This was updated as without it, we ended up with a dangling reference when nullptr is passed in, like when we're creating a builder from a type and don't have a dictionary array to pass in.
|
Would you mind giving this a look over some time @pitrou? I did use Claude and have tried to go through all generated code methodically to understand what it's doing and hopefully make it idiomatic, but let me know if there is anywhere I could have been more diligent in this. CI failures appear to be ones on main and not related to this PR. |
Rationale for this change
Ordered dictionaries inside nested types lose their
orderedflag during construction, becauseDictionaryBuilderdoesn't track it.What changes are included in this PR?
Store the
orderedflag inDictionaryBuilderBaseand pass it through when reconstructing theDictionaryTypeintype()andFinishInternal().Remove the R-side workaround that was patching this for top-level columns only.
Are these changes tested?
Yes
Are there any user-facing changes?
No
AI usage
Written by Claude, reviewed by me and Codex (via roborev). I had Claude create the tests first, to make sure they failed as expected, then had Claude make the fixes and checked the tests passed. I questioned the approach as I went.