Skip to content

Commit f16f8b3

Browse files
committed
Fallback for upsert when Arrow cannot compare source rows with target rows
1 parent 7a6a7c8 commit f16f8b3

2 files changed

Lines changed: 82 additions & 12 deletions

File tree

pyiceberg/table/upsert_util.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,18 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
6767

6868
diff_expr = functools.reduce(operator.or_, [pc.field(f"{col}-lhs") != pc.field(f"{col}-rhs") for col in non_key_cols])
6969

70-
return (
71-
source_table
72-
# We already know that the schema is compatible, this is to fix large_ types
73-
.cast(target_table.schema)
74-
.join(target_table, keys=list(join_cols_set), join_type="inner", left_suffix="-lhs", right_suffix="-rhs")
75-
.filter(diff_expr)
76-
.drop_columns([f"{col}-rhs" for col in non_key_cols])
77-
.rename_columns({f"{col}-lhs" if col not in join_cols else col: col for col in source_table.column_names})
78-
# Finally cast to the original schema since it doesn't carry nullability:
79-
# https://github.com/apache/arrow/issues/45557
80-
).cast(target_table.schema)
70+
try:
71+
return (
72+
source_table
73+
# We already know that the schema is compatible, this is to fix large_ types
74+
.cast(target_table.schema)
75+
.join(target_table, keys=list(join_cols_set), join_type="inner", left_suffix="-lhs", right_suffix="-rhs")
76+
.filter(diff_expr)
77+
.drop_columns([f"{col}-rhs" for col in non_key_cols])
78+
.rename_columns({f"{col}-lhs" if col not in join_cols else col: col for col in source_table.column_names})
79+
# Finally cast to the original schema since it doesn't carry nullability:
80+
# https://github.com/apache/arrow/issues/45557
81+
).cast(target_table.schema)
82+
except pa.ArrowInvalid:
83+
# When we are not able to compare, just update all rows from source table
84+
return source_table.cast(target_table.schema)

tests/table/test_upsert.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from pyiceberg.table import UpsertResult
3131
from pyiceberg.table.snapshots import Operation
3232
from pyiceberg.table.upsert_util import create_match_filter
33-
from pyiceberg.types import IntegerType, NestedField, StringType
33+
from pyiceberg.types import IntegerType, NestedField, StringType, StructType
3434
from tests.catalog.test_base import InMemoryCatalog, Table
3535

3636

@@ -509,3 +509,69 @@ def test_upsert_without_identifier_fields(catalog: Catalog) -> None:
509509
ValueError, match="Join columns could not be found, please set identifier-field-ids or pass in explicitly."
510510
):
511511
tbl.upsert(df)
512+
513+
514+
def test_upsert_struct_field_fails_in_join(catalog: Catalog) -> None:
    """Upsert falls back to updating every matched row when rows cannot be compared.

    The source and target tables contain a struct column, which pyarrow cannot
    use in the join-based row diff (it raises ``ArrowInvalid``). The upsert is
    expected to fall back to treating all matched source rows as updates rather
    than raising — so even an unchanged row counts as updated.
    """
    identifier = "default.test_upsert_struct_field_fails"
    # Make the test re-runnable against a shared catalog.
    _drop_table(catalog, identifier)

    # Iceberg schema: an identifier column plus a struct column that the
    # join-based comparison cannot handle.
    schema = Schema(
        NestedField(1, "id", IntegerType(), required=True),
        NestedField(
            2,
            "nested_type",
            # Struct<sub1: string, sub2: string>
            StructType(
                NestedField(3, "sub1", StringType(), required=True),
                NestedField(4, "sub2", StringType(), required=True),
            ),
            required=False,
        ),
        identifier_field_ids=[1],
    )

    tbl = catalog.create_table(identifier, schema=schema)

    # Arrow schema mirroring the Iceberg schema; large_string is used so the
    # upsert path also exercises the cast from large_ types.
    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int32(), nullable=False),
            pa.field(
                "nested_type",
                pa.struct(
                    [
                        pa.field("sub1", pa.large_string(), nullable=False),
                        pa.field("sub2", pa.large_string(), nullable=False),
                    ]
                ),
                nullable=True,
            ),
        ]
    )

    initial_data = pa.Table.from_pylist(
        [
            {
                "id": 1,
                "nested_type": {"sub1": "bla1", "sub2": "bla"},
            }
        ],
        schema=arrow_schema,
    )
    tbl.append(initial_data)

    # Identical row to the one already in the table: no column actually changed.
    update_data = pa.Table.from_pylist(
        [
            {
                "id": 1,
                "nested_type": {"sub1": "bla1", "sub2": "bla"},
            }
        ],
        schema=arrow_schema,
    )

    upd = tbl.upsert(update_data, join_cols=["id"])

    # The row needs to be counted as updated even though it's not changed:
    # when pyarrow isn't able to compare rows, everything matched is updated.
    assert upd.rows_updated == 1
    assert upd.rows_inserted == 0

0 commit comments

Comments
 (0)