@@ -885,3 +885,189 @@ def test_upsert_snapshot_properties(catalog: Catalog) -> None:
885885 for snapshot in snapshots [initial_snapshot_count :]:
886886 assert snapshot .summary is not None
887887 assert snapshot .summary .additional_properties .get ("test_prop" ) == "test_value"
888+
889+
def test_coarse_match_filter_composite_key() -> None:
    """Compare the exact and coarse match filters built for a two-column key.

    The input holds the key tuples (1, 'x'), (2, 'y') and (1, 'z'), so the
    value 1 repeats in column ``a``. The exact filter is expected to render
    with an ``Or``; the coarse filter is expected to render with both ``And``
    and ``In`` (intended shape: In(a, [1, 2]) AND In(b, ['x', 'y', 'z'])).
    """
    from pyiceberg.table.upsert_util import create_coarse_match_filter, create_match_filter

    key_columns = ("a", "b")
    arrow_schema = pa.schema(
        [
            pa.field("a", pa.int32()),
            pa.field("b", pa.string()),
            pa.field("val", pa.int32()),
        ]
    )
    rows = [
        {"a": 1, "b": "x", "val": 1},
        {"a": 2, "b": "y", "val": 2},
        {"a": 1, "b": "z", "val": 3},
    ]
    df = pa.Table.from_pylist(rows, schema=arrow_schema)

    # Each call receives its own fresh list of join columns.
    exact_repr = str(create_match_filter(df, list(key_columns)))
    coarse_repr = str(create_coarse_match_filter(df, list(key_columns)))

    # Exact filter: one condition per key tuple, combined with Or.
    assert "Or" in exact_repr

    # Coarse filter: per-column membership predicates combined with And.
    for token in ("And", "In"):
        assert token in coarse_repr
916+
917+
def test_vectorized_comparison_primitives() -> None:
    """Check per-row difference flags for integer, string and float arrays."""
    from pyiceberg.table.upsert_util import _compare_columns_vectorized

    # Each case: (source values, target values, expected "row differs" flags).
    cases = [
        ([1, 2, 3, 4], [1, 2, 5, 4], [False, False, True, False]),  # integers
        (["a", "b", "c"], ["a", "x", "c"], [False, True, False]),  # strings
        ([1.0, 2.5, 3.0], [1.0, 2.5, 3.1], [False, False, True]),  # floats
    ]

    for source_values, target_values, expected in cases:
        changed = _compare_columns_vectorized(pa.array(source_values), pa.array(target_values))
        assert changed.to_pylist() == expected
939+
940+
def test_vectorized_comparison_nulls() -> None:
    """Check how nulls on either side affect the per-row difference flags."""
    from pyiceberg.table.upsert_util import _compare_columns_vectorized

    # Each case: (source values, target values, expected "row differs" flags).
    cases = [
        # null in source, value in target -> counted as different
        ([1, None, 3], [1, 2, 3], [False, True, False]),
        # value in source, null in target -> counted as different
        ([1, 2, 3], [1, None, 3], [False, True, False]),
        # null on both sides -> counted as the same (no update needed)
        ([1, None, 3], [1, None, 3], [False, False, False]),
    ]

    for source_values, target_values, expected in cases:
        changed = _compare_columns_vectorized(pa.array(source_values), pa.array(target_values))
        assert changed.to_pylist() == expected
962+
963+
def test_vectorized_comparison_structs() -> None:
    """Check per-row difference flags when the compared columns are structs."""
    from pyiceberg.table.upsert_util import _compare_columns_vectorized

    struct_type = pa.struct([("x", pa.int32()), ("y", pa.string())])
    baseline = [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}]

    # Each case: (target values, expected "row differs" flags); the source
    # side always holds the same two baseline structs.
    cases = [
        # identical structs
        ([{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], [False, False]),
        # second struct differs in field "y"
        ([{"x": 1, "y": "a"}, {"x": 2, "y": "c"}], [False, True]),
    ]

    for target_values, expected in cases:
        source_col = pa.array(baseline, type=struct_type)
        target_col = pa.array(target_values, type=struct_type)
        changed = _compare_columns_vectorized(source_col, target_col)
        assert changed.to_pylist() == expected
981+
982+
def test_vectorized_comparison_nested_structs() -> None:
    """Check that a change inside a struct nested in a struct is flagged."""
    from pyiceberg.table.upsert_util import _compare_columns_vectorized

    outer_struct = pa.struct(
        [
            ("inner", pa.struct([("val", pa.int32())])),
            ("name", pa.string()),
        ]
    )

    source_rows = [
        {"inner": {"val": 1}, "name": "a"},
        {"inner": {"val": 2}, "name": "b"},
    ]
    # Only the second row differs, and only in the nested "val" field.
    target_rows = [
        {"inner": {"val": 1}, "name": "a"},
        {"inner": {"val": 3}, "name": "b"},
    ]

    changed = _compare_columns_vectorized(
        pa.array(source_rows, type=outer_struct),
        pa.array(target_rows, type=outer_struct),
    )
    assert changed.to_pylist() == [False, True]
1000+
1001+
def test_vectorized_comparison_lists() -> None:
    """Check per-row difference flags when the compared columns are int32 lists.

    The original author notes that list types fall back to a Python-level
    comparison inside the helper.
    """
    from pyiceberg.table.upsert_util import _compare_columns_vectorized

    int_list = pa.list_(pa.int32())
    source_col = pa.array([[1, 2], [3, 4]], type=int_list)
    target_col = pa.array([[1, 2], [3, 5]], type=int_list)  # second list differs

    flags = _compare_columns_vectorized(source_col, target_col).to_pylist()
    assert flags == [False, True]
1012+
1013+
def test_get_rows_to_update_no_non_key_cols() -> None:
    """Check get_rows_to_update yields no rows when every column is a key column."""
    from pyiceberg.table.upsert_util import get_rows_to_update

    # Both sides carry only the join column "id", leaving nothing to compare.
    source_table, target_table = (pa.Table.from_pydict({"id": [1, 2, 3]}) for _ in range(2))

    to_update = get_rows_to_update(source_table, target_table, ["id"])
    assert len(to_update) == 0
1023+
1024+
def test_upsert_with_list_field(catalog: Catalog) -> None:
    """Upsert into a table whose only non-key column is a list of strings.

    Seeds the table with ids 1 and 2, then upserts ids 1 (unchanged tags),
    2 (changed tags) and 3 (new), expecting one update and one insert.
    """
    from pyiceberg.types import ListType

    identifier = "default.test_upsert_with_list_field"
    _drop_table(catalog, identifier)

    id_field = NestedField(1, "id", IntegerType(), required=True)
    tags_field = NestedField(
        2,
        "tags",
        ListType(element_id=3, element_type=StringType(), element_required=False),
        required=False,
    )
    iceberg_schema = Schema(id_field, tags_field, identifier_field_ids=[1])
    tbl = catalog.create_table(identifier, schema=iceberg_schema)

    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int32(), nullable=False),
            pa.field("tags", pa.list_(pa.large_string()), nullable=True),
        ]
    )

    def _as_arrow(rows: list) -> pa.Table:
        # Build an Arrow table for the given row dicts using the shared schema.
        return pa.Table.from_pylist(rows, schema=arrow_schema)

    seed_rows = _as_arrow(
        [
            {"id": 1, "tags": ["a", "b"]},
            {"id": 2, "tags": ["c"]},
        ]
    )
    tbl.append(seed_rows)

    incoming_rows = _as_arrow(
        [
            {"id": 1, "tags": ["a", "b"]},  # identical to the seed row -> no update
            {"id": 2, "tags": ["c", "d"]},  # tags changed -> should update
            {"id": 3, "tags": ["e"]},  # id not present yet -> should insert
        ]
    )

    outcome = tbl.upsert(incoming_rows, join_cols=["id"])
    assert (outcome.rows_updated, outcome.rows_inserted) == (1, 1)
0 commit comments