 
 from pyiceberg.expressions import (
     AlwaysFalse,
+    AlwaysTrue,
+    And,
     BooleanExpression,
     EqualTo,
+    GreaterThanOrEqual,
     In,
+    LessThanOrEqual,
     Or,
 )
 
+# Threshold for switching from an In() predicate to a range-based filter or no filter.
+# When the unique keys exceed this count, the In() predicate becomes too expensive to evaluate.
+LARGE_FILTER_THRESHOLD = 10_000
+
+# Minimum density (ratio of unique values to range size) for a range filter to be effective.
+# Below this threshold, range filters read too much irrelevant data.
+DENSITY_THRESHOLD = 0.1
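+
+# Illustrative decision table for a single join column (numbers are examples, not
+# part of the change): 9_999 unique keys -> In() predicate; 10_000 dense numeric
+# keys (density > 0.1) -> min/max range filter; 10_000 sparse or non-numeric
+# keys -> AlwaysTrue(), i.e. a full scan.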
+
 
 def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpression:
     """
@@ -58,32 +70,119 @@ def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpre
         return Or(*filters)
 
 
+def _is_numeric_type(arrow_type: pa.DataType) -> bool:
+    """Check if a PyArrow type is numeric (suitable for range filtering)."""
+    return pa.types.is_integer(arrow_type) or pa.types.is_floating(arrow_type)
+
+
+def _create_range_filter(col_name: str, values: pa.Array) -> BooleanExpression:
+    """Create a min/max range filter for a numeric column."""
+    min_val = pc.min(values).as_py()
+    max_val = pc.max(values).as_py()
+    return And(GreaterThanOrEqual(col_name, min_val), LessThanOrEqual(col_name, max_val))
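+
+# Illustrative use ("id" is a made-up column name): _create_range_filter("id", pa.array([5, 9, 7]))
+# returns And(GreaterThanOrEqual("id", 5), LessThanOrEqual("id", 9)).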
+
+
 def create_coarse_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpression:
     """
     Create a coarse Iceberg BooleanExpression filter for initial row scanning.
 
-    For single-column keys, uses an efficient In() predicate (exact match).
-    For composite keys, uses In() per column as a coarse filter (AND of In() predicates),
-    which may return false positives but is much more efficient than exact matching.
+    This is an optimization for reducing the scan size before exact matching happens
+    downstream (e.g., in get_rows_to_update() via the join operation). It trades filter
+    precision for filter evaluation speed.
+
+    IMPORTANT: This is not a silver-bullet optimization. It only helps specific use cases:
+    - Datasets with fewer than 10,000 unique keys benefit from In() predicates
+    - Large datasets with dense numeric keys (> 10% density) benefit from range filters
+    - Large datasets with sparse keys or non-numeric columns fall back to a full scan
+
+    For small datasets (< LARGE_FILTER_THRESHOLD unique keys, currently 10,000):
+    - Single-column keys: uses an In() predicate
+    - Composite keys: uses an AND of per-column In() predicates
+
+    For large datasets (>= LARGE_FILTER_THRESHOLD unique keys):
+    - Single numeric column with dense IDs (> 10% coverage): uses a min/max range filter
+    - Otherwise: returns AlwaysTrue() to skip filtering (full scan)
 
-    This function should only be used for initial scans where exact matching happens
-    downstream (e.g., in get_rows_to_update() via the join operation).
+    The density threshold (DENSITY_THRESHOLD = 0.1, i.e. 10%) determines whether a range
+    filter is efficient. Below this threshold, the range would include too many
+    non-matching rows, making a full scan more practical.
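+
+    For example (illustrative numbers): 50,000 unique integer IDs between 1 and
+    200,000 have density 0.25, so the scan filter becomes 1 <= id <= 200,000;
+    50,000 IDs scattered across a 64-bit space have density near zero, so the
+    function returns AlwaysTrue() and the caller performs a full scan.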
+
+    Args:
+        df: PyArrow table containing the source data with the join columns
+        join_cols: List of column names to use for matching
+
+    Returns:
+        BooleanExpression filter for an Iceberg table scan
     """
     unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
+    num_unique_keys = len(unique_keys)
 
-    if len(unique_keys) == 0:
+    if num_unique_keys == 0:
         return AlwaysFalse()
 
+    # For small datasets, use the standard In() approach
+    if num_unique_keys < LARGE_FILTER_THRESHOLD:
+        if len(join_cols) == 1:
+            return In(join_cols[0], unique_keys[0].to_pylist())
+        else:
+            column_filters = []
+            for col in join_cols:
+                unique_values = pc.unique(unique_keys[col]).to_pylist()
+                column_filters.append(In(col, unique_values))
+            if len(column_filters) == 0:
+                return AlwaysFalse()
+            if len(column_filters) == 1:
+                return column_filters[0]
+            return functools.reduce(operator.and_, column_filters)
+
+    # For large datasets, use optimized strategies
     if len(join_cols) == 1:
-        return In(join_cols[0], unique_keys[0].to_pylist())
+        col_name = join_cols[0]
+        col_data = unique_keys[col_name]
+        col_type = col_data.type
+
+        # For numeric columns, check if a range filter is efficient (dense IDs)
+        if _is_numeric_type(col_type):
+            min_val = pc.min(col_data).as_py()
+            max_val = pc.max(col_data).as_py()
+            value_range = max_val - min_val + 1
+            density = num_unique_keys / value_range if value_range > 0 else 0
+
+            # If IDs are dense (> 10% coverage of the range), use a range filter
+            # Otherwise, the range filter would read too much irrelevant data
+            if density > DENSITY_THRESHOLD:
+                return _create_range_filter(col_name, col_data)
+            else:
+                return AlwaysTrue()
+        else:
+            # Non-numeric single column with many values - skip the filter
+            return AlwaysTrue()
     else:
-        # For composite keys: use In() per column as a coarse filter
-        # This is more efficient than creating Or(And(...), And(...), ...) for each row
-        # May include false positives, but fine-grained matching happens downstream
+        # Composite keys with many values - use range filters for numeric columns where possible
         column_filters = []
         for col in join_cols:
-            unique_values = pc.unique(unique_keys[col]).to_pylist()
-            column_filters.append(In(col, unique_values))
+            col_data = unique_keys[col]
+            col_type = col_data.type
+            unique_values = pc.unique(col_data)
+
+            if _is_numeric_type(col_type) and len(unique_values) >= LARGE_FILTER_THRESHOLD:
+                # Use a range filter for large numeric columns
+                min_val = pc.min(unique_values).as_py()
+                max_val = pc.max(unique_values).as_py()
+                value_range = max_val - min_val + 1
+                density = len(unique_values) / value_range if value_range > 0 else 0
+
+                if density > DENSITY_THRESHOLD:
+                    column_filters.append(_create_range_filter(col, unique_values))
+                else:
+                    # Sparse numeric column - still use In() as it's part of a composite key
+                    column_filters.append(In(col, unique_values.to_pylist()))
+            else:
+                # Small or non-numeric column - use In()
+                column_filters.append(In(col, unique_values.to_pylist()))
+
+        if len(column_filters) == 0:
+            return AlwaysTrue()
         return functools.reduce(operator.and_, column_filters)
 
 
@@ -98,8 +197,21 @@ def _compare_columns_vectorized(
     """
     Vectorized comparison of two columns, returning a boolean array where True means values differ.
 
-    Handles struct types recursively by comparing each nested field.
-    Handles null values correctly: null != non-null is True, null == null is True (no update needed).
+    Handles different PyArrow types:
+    - Primitive types: uses pc.not_equal() with proper null handling
+    - Struct types: recursively compares each nested field
+    - List/Map types: falls back to Python comparison (still batched, not row-by-row)
+
+    Null handling semantics:
+    - null != non-null -> True (values differ, needs an update)
+    - null == null -> False (values are the same, no update needed)
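+
+    For example (illustrative values): comparing source [1, None, 3] with target
+    [1, 2, None] yields [False, True, True] - equal non-nulls match, and null vs.
+    non-null differs in either direction.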
+
+    Args:
+        source_col: Column from the source table
+        target_col: Column from the target table (must have the same length)
+
+    Returns:
+        Boolean PyArrow array where True indicates the values at that index differ
     """
     col_type = source_col.type
 
@@ -155,7 +267,32 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
     Return a table with rows that need to be updated in the target table based on the join columns.
 
     Uses vectorized PyArrow operations for efficient comparison, avoiding row-by-row Python loops.
-    The table is joined on the identifier columns, and then checked if there are any updated rows.
+    The function performs an inner join on the identifier columns, then compares non-key columns
+    to find rows where values have actually changed.
+
+    Algorithm:
+    1. Prepare source and target index tables with row indices
+    2. Inner join on the join columns to find matching rows
+    3. Use take() to extract the matched rows in batch
+    4. Compare non-key columns using vectorized operations
+    5. Filter to rows where at least one non-key column differs
+
+    Note: The column names '__source_index' and '__target_index' are reserved for internal use
+    and cannot be used as join column names.
+
+    Args:
+        source_table: PyArrow table with new/updated data
+        target_table: PyArrow table with existing data
+        join_cols: List of column names that form the unique key
+
+    Returns:
+        PyArrow table containing only the rows from source_table that exist in target_table
+        and have at least one non-key column with a different value. Returns an empty table
+        if no updates are needed.
+
+    Raises:
+        ValueError: If target_table has duplicate rows based on join_cols
+        ValueError: If join_cols contains reserved column names
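+
+    Example (illustrative data): with join_cols=["id"], a source row (id=2, name="B")
+    and a target row (id=2, name="b") produce one row in the result; a row whose
+    values are equal in both tables is dropped, and a source id absent from the
+    target is ignored (it is an insert, not an update).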
     """
     all_columns = set(source_table.column_names)
     join_cols_set = set(join_cols)
@@ -183,8 +320,8 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
         ) from None
 
     # Step 1: Prepare source index with join keys and a marker index
-    # Cast to target table schema, so we can do the join
-    # See: https://github.com/apache/arrow/issues/37542
+    # Cast source to target schema to ensure type compatibility for the join
+    # (e.g., source int32 vs target int64 would cause join issues)
     source_index = (
         source_table.cast(target_table.schema)
         .select(join_cols_set)