Skip to content

Commit 79f6181

Browse files
committed
Do row comparison in Python
1 parent 4e75ce1 commit 79f6181

2 files changed

Lines changed: 69 additions & 41 deletions

File tree

pyiceberg/table/upsert_util.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,11 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
6060
The table is joined on the identifier columns, and then checked if there are any updated rows.
6161
Those are selected and everything is renamed correctly.
6262
"""
63+
all_columns = set(source_table.column_names)
6364
join_cols_set = set(join_cols)
6465

66+
non_key_cols = list(all_columns - join_cols_set)
67+
6568
if has_duplicate_rows(target_table, join_cols):
6669
raise ValueError("Target table has duplicate rows, aborting upsert")
6770

@@ -73,11 +76,12 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
7376
# fall back to selecting only rows in the source table that do NOT already exist in the target.
7477
# See: https://github.com/apache/arrow/issues/35785
7578
MARKER_COLUMN_NAME = "__from_target"
76-
INDEX_COLUMN_NAME = "__source_index"
79+
SOURCE_INDEX_COLUMN_NAME = "__source_index"
80+
TARGET_INDEX_COLUMN_NAME = "__target_index"
7781

78-
if MARKER_COLUMN_NAME in join_cols or INDEX_COLUMN_NAME in join_cols:
82+
if MARKER_COLUMN_NAME in join_cols or SOURCE_INDEX_COLUMN_NAME in join_cols or TARGET_INDEX_COLUMN_NAME in join_cols:
7983
raise ValueError(
80-
f"{MARKER_COLUMN_NAME} and {INDEX_COLUMN_NAME} are reserved for joining "
84+
f"{MARKER_COLUMN_NAME}, {SOURCE_INDEX_COLUMN_NAME} and {TARGET_INDEX_COLUMN_NAME} are reserved for joining "
8185
f"DataFrames, and cannot be used as column names"
8286
) from None
8387

@@ -87,17 +91,39 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
8791
source_index = (
8892
source_table.cast(target_table.schema)
8993
.select(join_cols_set)
90-
.append_column(INDEX_COLUMN_NAME, pa.array(range(len(source_table))))
94+
.append_column(SOURCE_INDEX_COLUMN_NAME, pa.array(range(len(source_table))))
9195
)
9296

9397
# Step 2: Prepare target index with join keys and a marker
94-
target_index = target_table.select(join_cols_set).append_column(MARKER_COLUMN_NAME, pa.repeat(True, len(target_table)))
98+
target_index = (
99+
target_table.select(join_cols_set)
100+
.append_column(TARGET_INDEX_COLUMN_NAME, pa.array(range(len(target_table))))
101+
.append_column(MARKER_COLUMN_NAME, pa.repeat(True, len(target_table)))
102+
)
95103

96104
# Step 3: Perform a left outer join to find which rows from source exist in target
97105
joined = source_index.join(target_index, keys=list(join_cols_set), join_type="left outer")
98106

99107
# Step 4: Create indices for rows that do exist in the target i.e., where marker column is true after the join
100-
to_update_indices = joined.filter(pc.field(MARKER_COLUMN_NAME))[INDEX_COLUMN_NAME]
101-
102-
# Step 5: Take rows from source table using the indices and cast to target schema
103-
return source_table.take(to_update_indices)
108+
matching_indices = joined.filter(pc.field(MARKER_COLUMN_NAME))
109+
110+
# Step 5: Compare all rows using Python
111+
to_update_indices = []
112+
for source_idx, target_idx in zip(
113+
matching_indices[SOURCE_INDEX_COLUMN_NAME].to_pylist(), matching_indices[TARGET_INDEX_COLUMN_NAME].to_pylist()
114+
):
115+
source_row = source_table.slice(source_idx, 1)
116+
target_row = target_table.slice(target_idx, 1)
117+
118+
for key in non_key_cols:
119+
source_val = source_row.column(key)[0].as_py()
120+
target_val = target_row.column(key)[0].as_py()
121+
if source_val != target_val:
122+
to_update_indices.append(source_idx)
123+
break
124+
125+
# Step 6: Take rows from source table using the indices and cast to target schema
126+
if to_update_indices:
127+
return source_table.take(to_update_indices)
128+
else:
129+
return source_table.schema.empty_table()

tests/table/test_upsert.py

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def test_merge_scenario_skip_upd_row(catalog: Catalog) -> None:
186186

187187
res = table.upsert(df=source_df, join_cols=["order_id"])
188188

189-
expected_updated = 2
189+
expected_updated = 1
190190
expected_inserted = 1
191191

192192
assert_upsert_result(res, expected_updated, expected_inserted)
@@ -222,7 +222,7 @@ def test_merge_scenario_date_as_key(catalog: Catalog) -> None:
222222

223223
res = table.upsert(df=source_df, join_cols=["order_date"])
224224

225-
expected_updated = 2
225+
expected_updated = 1
226226
expected_inserted = 1
227227

228228
assert_upsert_result(res, expected_updated, expected_inserted)
@@ -258,7 +258,7 @@ def test_merge_scenario_string_as_key(catalog: Catalog) -> None:
258258

259259
res = table.upsert(df=source_df, join_cols=["order_id"])
260260

261-
expected_updated = 2
261+
expected_updated = 1
262262
expected_inserted = 1
263263

264264
assert_upsert_result(res, expected_updated, expected_inserted)
@@ -371,25 +371,16 @@ def test_upsert_with_identifier_fields(catalog: Catalog) -> None:
371371

372372
expected_operations = [Operation.APPEND, Operation.OVERWRITE, Operation.APPEND, Operation.APPEND]
373373

374-
assert upd.rows_updated == 2
374+
assert upd.rows_updated == 1
375375
assert upd.rows_inserted == 1
376376

377377
assert [snap.summary.operation for snap in tbl.snapshots() if snap.summary is not None] == expected_operations
378378

379-
# This will update all 3 rows
379+
# This should be a no-op
380380
upd = tbl.upsert(df)
381381

382-
assert upd.rows_updated == 3
382+
assert upd.rows_updated == 0
383383
assert upd.rows_inserted == 0
384-
expected_operations = [
385-
Operation.APPEND,
386-
Operation.OVERWRITE,
387-
Operation.APPEND,
388-
Operation.APPEND,
389-
Operation.DELETE,
390-
Operation.OVERWRITE,
391-
Operation.APPEND,
392-
]
393384

394385
assert [snap.summary.operation for snap in tbl.snapshots() if snap.summary is not None] == expected_operations
395386

@@ -561,7 +552,7 @@ def test_upsert_struct_field_fails_in_join(catalog: Catalog) -> None:
561552
[
562553
{
563554
"id": 1,
564-
"nested_type": {"sub1": "1_sub1_init", "sub2": "1sub2_init"},
555+
"nested_type": {"sub1": "bla1", "sub2": "bla"},
565556
}
566557
],
567558
schema=arrow_schema,
@@ -572,32 +563,43 @@ def test_upsert_struct_field_fails_in_join(catalog: Catalog) -> None:
572563
[
573564
{
574565
"id": 2,
575-
"nested_type": {"sub1": "2_sub1_new", "sub2": "2_sub2_new"},
566+
"nested_type": {"sub1": "bla1", "sub2": "bla"},
576567
},
577568
{
578569
"id": 1,
579-
"nested_type": {"sub1": "1sub1_init", "sub2": "1sub2_new"},
570+
"nested_type": {"sub1": "bla1", "sub2": "bla2"},
580571
},
581-
# TODO: struct changes should cause _check_pyarrow_schema_compatible to fail. Introduce a new `sub3` attribute
582-
# {
583-
# "id": 1,
584-
# "nested_type": {"sub3": "1sub3_init", "sub2": "1sub2_new"},
585-
# },
586572
],
587573
schema=arrow_schema,
588574
)
589575

590-
upd = tbl.upsert(update_data, join_cols=["id"])
576+
res = tbl.upsert(update_data, join_cols=["id"])
591577

592-
# Row needs to be updated even tho it's not changed.
593-
# When pyarrow isn't able to compare rows, just update everything
594-
assert upd.rows_updated == 1
595-
assert upd.rows_inserted == 1
578+
expected_updated = 1
579+
expected_inserted = 1
596580

597-
assert tbl.scan().to_arrow().to_pylist() == [
598-
{"id": 2, "nested_type": {"sub1": "2_sub1_new", "sub2": "2_sub2_new"}},
599-
{"id": 1, "nested_type": {"sub1": "1sub1_init", "sub2": "1sub2_new"}},
600-
]
581+
assert_upsert_result(res, expected_updated, expected_inserted)
582+
583+
update_data = pa.Table.from_pylist(
584+
[
585+
{
586+
"id": 2,
587+
"nested_type": {"sub1": "bla1", "sub2": "bla"},
588+
},
589+
{
590+
"id": 1,
591+
"nested_type": {"sub1": "bla1", "sub2": "bla2"},
592+
},
593+
],
594+
schema=arrow_schema,
595+
)
596+
597+
res = tbl.upsert(update_data, join_cols=["id"])
598+
599+
expected_updated = 0
600+
expected_inserted = 0
601+
602+
assert_upsert_result(res, expected_updated, expected_inserted)
601603

602604

603605
def test_upsert_with_nulls(catalog: Catalog) -> None:

0 commit comments

Comments
 (0)