Commit a947f0a
feat: Optimize upsert process with coarse match filter and vectorized comparisons
1 parent ba65619 commit a947f0a

3 files changed: 311 additions & 26 deletions


pyiceberg/table/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -836,7 +836,8 @@ def upsert(
         )

         # get list of rows that exist so we don't have to load the entire target table
-        matched_predicate = upsert_util.create_match_filter(df, join_cols)
+        # Use coarse filter for initial scan - exact matching happens in get_rows_to_update()
+        matched_predicate = upsert_util.create_coarse_match_filter(df, join_cols)

         # We must use Transaction.table_metadata for the scan. This includes all uncommitted - but relevant - changes.

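The new comment captures the contract that makes this swap safe: the coarse filter may scan a superset of the truly matching rows, but get_rows_to_update() re-joins source and target on the exact key columns, so a false positive only costs extra scanned rows, never a wrong update. A minimal sketch of that contract, assuming toy in-memory tables (the data values are illustrative, the functions are the ones this commit touches):

import pyarrow as pa

from pyiceberg.table import upsert_util

source = pa.Table.from_pylist([{"a": 1, "b": "x", "v": 10}, {"a": 2, "b": "y", "v": 20}])

# Coarse predicate is roughly In(a, [1, 2]) AND In(b, ["x", "y"]), so a target
# row with the key (1, "y") would be scanned even though it is not a source key ...
print(upsert_util.create_coarse_match_filter(source, ["a", "b"]))

# ... but the exact join inside get_rows_to_update() drops it again:
target = pa.Table.from_pylist([{"a": 1, "b": "y", "v": 99}, {"a": 2, "b": "y", "v": 21}])
updates = upsert_util.get_rows_to_update(source, target, ["a", "b"])
assert updates.to_pylist() == [{"a": 2, "b": "y", "v": 20}]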

pyiceberg/table/upsert_util.py

Lines changed: 123 additions & 25 deletions

@@ -16,6 +16,7 @@
 # under the License.
 import functools
 import operator
+from typing import Union

 import pyarrow as pa
 from pyarrow import Table as pyarrow_table
@@ -31,8 +32,18 @@


 def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpression:
+    """
+    Create an Iceberg BooleanExpression filter that exactly matches rows based on join columns.
+
+    For single-column keys, uses an efficient In() predicate.
+    For composite keys, creates Or(And(...), And(...), ...) for exact row matching.
+    This function should be used when exact matching is required (e.g., overwrite, insert filtering).
+    """
     unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])

+    if len(unique_keys) == 0:
+        return AlwaysFalse()
+
     if len(join_cols) == 1:
         return In(join_cols[0], unique_keys[0].to_pylist())
     else:
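One behavioral addition here is the empty-input guard: with no unique keys the filter must match nothing, rather than building an In() over an empty list or an Or() with no operands. A quick sketch, assuming toy tables:

import pyarrow as pa

from pyiceberg.expressions import AlwaysFalse
from pyiceberg.table.upsert_util import create_match_filter

schema = pa.schema([pa.field("id", pa.int64())])

# Empty source: nothing to match, so the filter short-circuits to AlwaysFalse
assert isinstance(create_match_filter(schema.empty_table(), ["id"]), AlwaysFalse)

# Non-empty single-column source: one In() predicate over the unique key values
df = pa.Table.from_pylist([{"id": 1}, {"id": 2}, {"id": 2}], schema=schema)
print(create_match_filter(df, ["id"]))  # roughly: In("id", {1, 2})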
@@ -48,17 +59,97 @@ def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpression:
         return Or(*filters)


+def create_coarse_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpression:
+    """
+    Create a coarse Iceberg BooleanExpression filter for initial row scanning.
+
+    For single-column keys, uses an efficient In() predicate (exact match).
+    For composite keys, uses In() per column as a coarse filter (AND of In() predicates),
+    which may return false positives but is much more efficient than exact matching.
+
+    This function should only be used for initial scans where exact matching happens
+    downstream (e.g., in get_rows_to_update() via the join operation).
+    """
+    unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
+
+    if len(unique_keys) == 0:
+        return AlwaysFalse()
+
+    if len(join_cols) == 1:
+        return In(join_cols[0], unique_keys[0].to_pylist())
+    else:
+        # For composite keys: use In() per column as a coarse filter
+        # This is more efficient than creating Or(And(...), And(...), ...) for each row
+        # May include false positives, but fine-grained matching happens downstream
+        column_filters = []
+        for col in join_cols:
+            unique_values = pc.unique(unique_keys[col]).to_pylist()
+            column_filters.append(In(col, unique_values))
+        return functools.reduce(operator.and_, column_filters)
+
+
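The efficiency claim is about predicate size: for n unique composite keys over k columns, the exact filter carries n And() terms while the coarse filter carries only k In() lists. The trade-off is false positives, such as the key (2, 'x') below. A small illustration, with assumed toy data:

import pyarrow as pa

from pyiceberg.table.upsert_util import create_coarse_match_filter, create_match_filter

keys = pa.Table.from_pylist([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 1, "b": "z"}])

# Exact: Or(And(a=1, b='x'), And(a=2, b='y'), And(a=1, b='z')) - grows with the row count
print(create_match_filter(keys, ["a", "b"]))

# Coarse: In(a, [1, 2]) AND In(b, ['x', 'y', 'z']) - grows with the column count,
# but also admits combinations like (2, 'x') that never occur in `keys`
print(create_coarse_match_filter(keys, ["a", "b"]))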
 def has_duplicate_rows(df: pyarrow_table, join_cols: list[str]) -> bool:
     """Check for duplicate rows in a PyArrow table based on the join columns."""
     return len(df.select(join_cols).group_by(join_cols).aggregate([([], "count_all")]).filter(pc.field("count_all") > 1)) > 0

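has_duplicate_rows (unchanged context above) is the guard the upsert path relies on: it groups by the key columns and reports whether any group holds more than one row. A usage sketch with assumed data:

import pyarrow as pa

from pyiceberg.table.upsert_util import has_duplicate_rows

df = pa.Table.from_pylist([{"id": 1, "v": "a"}, {"id": 1, "v": "b"}, {"id": 2, "v": "c"}])
assert has_duplicate_rows(df, ["id"])           # id=1 appears twice
assert not has_duplicate_rows(df, ["id", "v"])  # the full composite key is unique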

+def _compare_columns_vectorized(
+    source_col: Union[pa.Array, pa.ChunkedArray], target_col: Union[pa.Array, pa.ChunkedArray]
+) -> pa.Array:
+    """
+    Vectorized comparison of two columns, returning a boolean array where True means values differ.
+
+    Handles struct types recursively by comparing each nested field.
+    Handles null values correctly: null != non-null is True, null == null is True (no update needed).
+    """
+    col_type = source_col.type
+
+    if pa.types.is_struct(col_type):
+        # PyArrow cannot directly compare struct columns, so we recursively compare each field
+        diff_masks = []
+        for i, field in enumerate(col_type):
+            src_field = pc.struct_field(source_col, [i])
+            tgt_field = pc.struct_field(target_col, [i])
+            field_diff = _compare_columns_vectorized(src_field, tgt_field)
+            diff_masks.append(field_diff)
+
+        if not diff_masks:
+            # Empty struct - no fields to compare, so no differences
+            return pa.array([False] * len(source_col), type=pa.bool_())
+
+        return functools.reduce(pc.or_, diff_masks)
+
+    elif pa.types.is_list(col_type) or pa.types.is_large_list(col_type) or pa.types.is_map(col_type):
+        # For list/map types, fall back to Python comparison as PyArrow doesn't support vectorized comparison
+        # This is still faster than the original row-by-row approach since we batch the conversion
+        source_py = source_col.to_pylist()
+        target_py = target_col.to_pylist()
+        return pa.array([s != t for s, t in zip(source_py, target_py, strict=True)], type=pa.bool_())
+
+    else:
+        # For primitive types, use vectorized not_equal
+        # Handle nulls: not_equal returns null when comparing with null
+        # We need: null vs non-null = different (True), null vs null = same (False)
+        diff = pc.not_equal(source_col, target_col)
+        source_null = pc.is_null(source_col)
+        target_null = pc.is_null(target_col)
+
+        # XOR of null masks: True if exactly one is null (meaning they differ)
+        null_diff = pc.xor(source_null, target_null)
+
+        # Combine: different if values differ OR exactly one is null
+        # Fill null comparison results with False (both non-null but comparison returned null shouldn't happen,
+        # but if it does, treat as no difference)
+        diff_filled = pc.fill_null(diff, False)
+        return pc.or_(diff_filled, null_diff)
+
+
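The primitive branch is worth replaying once by hand, because pc.not_equal is null-propagating: any comparison involving a null yields null rather than True or False, which is exactly what the XOR-of-null-masks correction repairs. A standalone trace with plain pyarrow.compute (the arrays are assumed examples):

import pyarrow as pa
import pyarrow.compute as pc

source = pa.array([1, None, None, 4])
target = pa.array([1, 2, None, 5])

diff = pc.not_equal(source, target)                         # [False, null, null, True] - nulls propagate
null_diff = pc.xor(pc.is_null(source), pc.is_null(target))  # [False, True, False, False]
result = pc.or_(pc.fill_null(diff, False), null_diff)       # [False, True, False, True]
assert result.to_pylist() == [False, True, False, True]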
 def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols: list[str]) -> pa.Table:
     """
     Return a table with rows that need to be updated in the target table based on the join columns.
+    Uses vectorized PyArrow operations for efficient comparison, avoiding row-by-row Python loops.
     The table is joined on the identifier columns, and then checked if there are any updated rows.
-    Those are selected and everything is renamed correctly.
     """
     all_columns = set(source_table.column_names)
     join_cols_set = set(join_cols)
@@ -69,13 +160,13 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols: list[str]) -> pa.Table:
         raise ValueError("Target table has duplicate rows, aborting upsert")

     if len(target_table) == 0:
-        # When the target table is empty, there is nothing to update :)
+        # When the target table is empty, there is nothing to update
+        return source_table.schema.empty_table()
+
+    if len(non_key_cols) == 0:
+        # No non-key columns to compare, all matched rows are "updates" but with no changes
         return source_table.schema.empty_table()

-    # We need to compare non_key_cols in Python as PyArrow
-    # 1. Cannot do a join when non-join columns have complex types
-    # 2. Cannot compare columns with complex types
-    # See: https://github.com/apache/arrow/issues/35785
     SOURCE_INDEX_COLUMN_NAME = "__source_index"
     TARGET_INDEX_COLUMN_NAME = "__target_index"

@@ -100,25 +191,32 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
100191
# Step 3: Perform an inner join to find which rows from source exist in target
101192
matching_indices = source_index.join(target_index, keys=list(join_cols_set), join_type="inner")
102193

103-
# Step 4: Compare all rows using Python
104-
to_update_indices = []
105-
for source_idx, target_idx in zip(
106-
matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(),
107-
matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist(),
108-
strict=True,
109-
):
110-
source_row = source_table.slice(source_idx, 1)
111-
target_row = target_table.slice(target_idx, 1)
112-
113-
for key in non_key_cols:
114-
source_val = source_row.column(key)[0].as_py()
115-
target_val = target_row.column(key)[0].as_py()
116-
if source_val != target_val:
117-
to_update_indices.append(source_idx)
118-
break
119-
120-
# Step 5: Take rows from source table using the indices and cast to target schema
121-
if to_update_indices:
194+
if len(matching_indices) == 0:
195+
# No matching rows found
196+
return source_table.schema.empty_table()
197+
198+
# Step 4: Take matched rows in batch (vectorized - single operation)
199+
source_indices = matching_indices[SOURCE_INDEX_COLUMN_NAME]
200+
target_indices = matching_indices[TARGET_INDEX_COLUMN_NAME]
201+
202+
matched_source = source_table.take(source_indices)
203+
matched_target = target_table.take(target_indices)
204+
205+
# Step 5: Vectorized comparison per column
206+
diff_masks = []
207+
for col in non_key_cols:
208+
source_col = matched_source.column(col)
209+
target_col = matched_target.column(col)
210+
col_diff = _compare_columns_vectorized(source_col, target_col)
211+
diff_masks.append(col_diff)
212+
213+
# Step 6: Combine masks with OR (any column different = needs update)
214+
combined_mask = functools.reduce(pc.or_, diff_masks)
215+
216+
# Step 7: Filter to get indices of rows that need updating
217+
to_update_indices = pc.filter(source_indices, combined_mask)
218+
219+
if len(to_update_indices) > 0:
122220
return source_table.take(to_update_indices)
123221
else:
124222
return source_table.schema.empty_table()
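Distilled from the hunk above, the new hot path is three batched operations: take() to align the matched rows, one vectorized comparison per non-key column, and a single boolean filter. The sketch below replays it with plain PyArrow on assumed toy tables and hand-written join indices; it filters the aligned table directly, which is equivalent here to filtering the index vector and re-taking from the source as the commit does:

import functools

import pyarrow as pa
import pyarrow.compute as pc

source = pa.table({"id": [1, 2, 3], "v": [10, 20, 30], "w": ["a", "b", "c"]})
target = pa.table({"id": [2, 3], "v": [20, 99], "w": ["b", "c"]})

# Steps 1-3 (index tables plus inner join) reduce to aligned index vectors:
# keys id=2 and id=3 match, at source positions [1, 2] and target positions [0, 1]
source_indices = pa.array([1, 2])
target_indices = pa.array([0, 1])

# Step 4: align the matched rows with two batched take() calls
matched_source = source.take(source_indices)
matched_target = target.take(target_indices)

# Steps 5-6: one boolean mask per non-key column, OR-ed together
masks = [pc.not_equal(matched_source.column(c), matched_target.column(c)) for c in ("v", "w")]
combined = functools.reduce(pc.or_, masks)

# Step 7: keep only the rows that actually changed (id=3: v is 30 vs 99)
assert matched_source.filter(combined).to_pylist() == [{"id": 3, "v": 30, "w": "c"}]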

tests/table/test_upsert.py

Lines changed: 186 additions & 0 deletions

@@ -885,3 +885,189 @@ def test_upsert_snapshot_properties(catalog: Catalog) -> None:
     for snapshot in snapshots[initial_snapshot_count:]:
         assert snapshot.summary is not None
         assert snapshot.summary.additional_properties.get("test_prop") == "test_value"
+
+
+def test_coarse_match_filter_composite_key() -> None:
+    """
+    Test that create_coarse_match_filter produces efficient In() predicates for composite keys.
+    """
+    from pyiceberg.table.upsert_util import create_coarse_match_filter, create_match_filter
+
+    # Create a table with composite key that has overlapping values:
+    # (1, 'x'), (2, 'y'), (1, 'z') - exact filter should have 3 conditions,
+    # coarse filter should have In(a, [1, 2]) AND In(b, ['x', 'y', 'z'])
+    data = [
+        {"a": 1, "b": "x", "val": 1},
+        {"a": 2, "b": "y", "val": 2},
+        {"a": 1, "b": "z", "val": 3},
+    ]
+    schema = pa.schema([pa.field("a", pa.int32()), pa.field("b", pa.string()), pa.field("val", pa.int32())])
+    table = pa.Table.from_pylist(data, schema=schema)
+
+    exact_filter = create_match_filter(table, ["a", "b"])
+    coarse_filter = create_coarse_match_filter(table, ["a", "b"])
+
+    # Exact filter is an Or of And conditions
+    assert "Or" in str(exact_filter)
+
+    # Coarse filter is an And of In conditions
+    assert "And" in str(coarse_filter)
+    assert "In" in str(coarse_filter)
+
+
+def test_vectorized_comparison_primitives() -> None:
+    """Test vectorized comparison with primitive types."""
+    from pyiceberg.table.upsert_util import _compare_columns_vectorized
+
+    # Test integers
+    source = pa.array([1, 2, 3, 4])
+    target = pa.array([1, 2, 5, 4])
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, False, True, False]
+
+    # Test strings
+    source = pa.array(["a", "b", "c"])
+    target = pa.array(["a", "x", "c"])
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, True, False]
+
+    # Test floats
+    source = pa.array([1.0, 2.5, 3.0])
+    target = pa.array([1.0, 2.5, 3.1])
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, False, True]
+
+
+def test_vectorized_comparison_nulls() -> None:
+    """Test vectorized comparison handles nulls correctly."""
+    from pyiceberg.table.upsert_util import _compare_columns_vectorized
+
+    # null vs non-null = different
+    source = pa.array([1, None, 3])
+    target = pa.array([1, 2, 3])
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, True, False]
+
+    # non-null vs null = different
+    source = pa.array([1, 2, 3])
+    target = pa.array([1, None, 3])
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, True, False]
+
+    # null vs null = same (no update needed)
+    source = pa.array([1, None, 3])
+    target = pa.array([1, None, 3])
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, False, False]
+
+
+def test_vectorized_comparison_structs() -> None:
+    """Test vectorized comparison with nested struct types."""
+    from pyiceberg.table.upsert_util import _compare_columns_vectorized
+
+    struct_type = pa.struct([("x", pa.int32()), ("y", pa.string())])
+
+    # Same structs
+    source = pa.array([{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], type=struct_type)
+    target = pa.array([{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], type=struct_type)
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, False]
+
+    # Different struct values
+    source = pa.array([{"x": 1, "y": "a"}, {"x": 2, "y": "b"}], type=struct_type)
+    target = pa.array([{"x": 1, "y": "a"}, {"x": 2, "y": "c"}], type=struct_type)
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, True]
+
+
+def test_vectorized_comparison_nested_structs() -> None:
+    """Test vectorized comparison with deeply nested struct types."""
+    from pyiceberg.table.upsert_util import _compare_columns_vectorized
+
+    inner_struct = pa.struct([("val", pa.int32())])
+    outer_struct = pa.struct([("inner", inner_struct), ("name", pa.string())])
+
+    source = pa.array(
+        [{"inner": {"val": 1}, "name": "a"}, {"inner": {"val": 2}, "name": "b"}],
+        type=outer_struct,
+    )
+    target = pa.array(
+        [{"inner": {"val": 1}, "name": "a"}, {"inner": {"val": 3}, "name": "b"}],
+        type=outer_struct,
+    )
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, True]
+
+
+def test_vectorized_comparison_lists() -> None:
+    """Test vectorized comparison with list types (falls back to Python comparison)."""
+    from pyiceberg.table.upsert_util import _compare_columns_vectorized
+
+    list_type = pa.list_(pa.int32())
+
+    source = pa.array([[1, 2], [3, 4]], type=list_type)
+    target = pa.array([[1, 2], [3, 5]], type=list_type)
+    diff = _compare_columns_vectorized(source, target)
+    assert diff.to_pylist() == [False, True]
+
+
+def test_get_rows_to_update_no_non_key_cols() -> None:
+    """Test get_rows_to_update when all columns are key columns."""
+    from pyiceberg.table.upsert_util import get_rows_to_update
+
+    # All columns are key columns, so no non-key columns to compare
+    source = pa.Table.from_pydict({"id": [1, 2, 3]})
+    target = pa.Table.from_pydict({"id": [1, 2, 3]})
+    rows = get_rows_to_update(source, target, ["id"])
+    assert len(rows) == 0
+
+
+def test_upsert_with_list_field(catalog: Catalog) -> None:
+    """Test upsert with list type as non-key column."""
+    from pyiceberg.types import ListType
+
+    identifier = "default.test_upsert_with_list_field"
+    _drop_table(catalog, identifier)
+
+    schema = Schema(
+        NestedField(1, "id", IntegerType(), required=True),
+        NestedField(
+            2,
+            "tags",
+            ListType(element_id=3, element_type=StringType(), element_required=False),
+            required=False,
+        ),
+        identifier_field_ids=[1],
+    )
+
+    tbl = catalog.create_table(identifier, schema=schema)
+
+    arrow_schema = pa.schema(
+        [
+            pa.field("id", pa.int32(), nullable=False),
+            pa.field("tags", pa.list_(pa.large_string()), nullable=True),
+        ]
+    )
+
+    initial_data = pa.Table.from_pylist(
+        [
+            {"id": 1, "tags": ["a", "b"]},
+            {"id": 2, "tags": ["c"]},
+        ],
+        schema=arrow_schema,
+    )
+    tbl.append(initial_data)
+
+    # Update with changed list
+    update_data = pa.Table.from_pylist(
+        [
+            {"id": 1, "tags": ["a", "b"]},  # Same - no update
+            {"id": 2, "tags": ["c", "d"]},  # Changed - should update
+            {"id": 3, "tags": ["e"]},  # New - should insert
+        ],
+        schema=arrow_schema,
+    )
+
+    res = tbl.upsert(update_data, join_cols=["id"])
+    assert res.rows_updated == 1
+    assert res.rows_inserted == 1
