180180from pyiceberg .utils .config import Config
181181from pyiceberg .utils .datetime import millis_to_datetime
182182from pyiceberg .utils .decimal import unscaled_to_decimal
183- from pyiceberg .utils .properties import get_first_property_value , property_as_bool , property_as_int
183+ from pyiceberg .utils .properties import (
184+ get_first_property_value ,
185+ property_as_bool ,
186+ property_as_float ,
187+ property_as_int ,
188+ )
184189from pyiceberg .utils .singleton import Singleton
185190from pyiceberg .utils .truncate import truncate_upper_bound_binary_string , truncate_upper_bound_text_string
186191
@@ -2473,6 +2478,120 @@ def parquet_path_to_id_mapping(
24732478 return result
24742479
24752480
def id_to_parquet_path_mapping(schema: Schema) -> dict[int, str]:
    """
    Compute the mapping of Iceberg column ID to parquet column path.

    Args:
        schema (pyiceberg.schema.Schema): The current table schema.

    Returns:
        A dict mapping each field ID to its dotted parquet column path.
    """
    # Pre-order traversal yields (field_id, parquet_path) pairs for every column.
    return {pair.field_id: pair.parquet_path for pair in pre_order_visit(schema, ID2ParquetPathVisitor())}
2492+
2493+
2494+ @dataclass (frozen = True )
2495+ class BloomFilterOptions :
2496+ parquet_path : str
2497+ ndv : int | None
2498+ fpp : float | None
2499+
2500+
class BloomFilterOptionsCollector(PreOrderSchemaVisitor[list[BloomFilterOptions]]):
    """Schema visitor that collects bloom filter options for every primitive column.

    Walks the schema pre-order, tracking the field ID of the column currently
    being visited in ``_field_id``, and at each primitive leaf resolves the
    column's bloom filter table properties. The ID-to-path mapping is supplied
    up front (see ``id_to_parquet_path_mapping``) so leaves can be translated
    to parquet column paths.
    """

    # Field ID of the column currently being descended into; each container
    # callback sets it immediately before invoking the child result callable.
    _field_id: int = 0
    _schema: Schema
    _properties: dict[str, str]
    # Precomputed Iceberg field ID -> dotted parquet column path.
    _id_to_parquet_path_mapping: dict[int, str]

    def __init__(self, schema: Schema, properties: dict[str, str], id_to_parquet_path_mapping: dict[int, str]):
        self._schema = schema
        self._properties = properties
        self._id_to_parquet_path_mapping = id_to_parquet_path_mapping

    def schema(
        self, schema: Schema, struct_result: Callable[[], builtins.list[BloomFilterOptions]]
    ) -> builtins.list[BloomFilterOptions]:
        # A schema's options are exactly those of its root struct.
        return struct_result()

    def struct(
        self, struct: StructType, field_results: builtins.list[Callable[[], builtins.list[BloomFilterOptions]]]
    ) -> builtins.list[BloomFilterOptions]:
        # Flatten the per-field option lists into one list.
        return list(itertools.chain(*[result() for result in field_results]))

    def field(
        self, field: NestedField, field_result: Callable[[], builtins.list[BloomFilterOptions]]
    ) -> builtins.list[BloomFilterOptions]:
        # Record the field ID before descending so a primitive child sees it.
        self._field_id = field.field_id
        return field_result()

    def list(
        self, list_type: ListType, element_result: Callable[[], builtins.list[BloomFilterOptions]]
    ) -> builtins.list[BloomFilterOptions]:
        self._field_id = list_type.element_id
        return element_result()

    def map(
        self,
        map_type: MapType,
        key_result: Callable[[], builtins.list[BloomFilterOptions]],
        value_result: Callable[[], builtins.list[BloomFilterOptions]],
    ) -> builtins.list[BloomFilterOptions]:
        # _field_id must be re-set before each child: traversing the key
        # subtree may overwrite it before the value subtree is visited.
        self._field_id = map_type.key_id
        k = key_result()
        self._field_id = map_type.value_id
        v = value_result()
        return k + v

    def primitive(self, primitive: PrimitiveType) -> builtins.list[BloomFilterOptions]:
        # Local import to avoid a circular dependency on pyiceberg.table.
        from pyiceberg.table import TableProperties

        column_name = self._schema.find_column_name(self._field_id)
        if column_name is None:
            return []

        parquet_path = self._id_to_parquet_path_mapping.get(self._field_id)
        if parquet_path is None:
            return []

        # Bloom filters are opt-in per column; an absent property means disabled.
        bloom_filter_enabled = property_as_bool(
            self._properties, f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX}.{column_name}", False
        )
        if not bloom_filter_enabled:
            return []

        # Optional tuning knobs; None defers to the parquet writer's defaults.
        bloom_filter_fpp = property_as_float(
            self._properties, f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_FPP_PREFIX}.{column_name}", None
        )
        bloom_filter_ndv = property_as_int(
            self._properties, f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_NDV_PREFIX}.{column_name}", None
        )

        return [BloomFilterOptions(parquet_path=parquet_path, ndv=bloom_filter_ndv, fpp=bloom_filter_fpp)]
2570+
2571+
def get_bloom_filter_options(
    schema: Schema,
    table_properties: dict[str, str],
) -> dict[str, dict[str, Any]]:
    """
    Get the bloom filter options from the table properties.

    Args:
        schema (pyiceberg.schema.Schema): The current table schema.
        table_properties (dict[str, str]): The table properties.

    Returns:
        A dict keyed by parquet column path, whose values hold the optional
        ``ndv``/``fpp`` settings for that column (empty when neither is set).
    """
    collector = BloomFilterOptionsCollector(schema, table_properties, id_to_parquet_path_mapping(schema))
    result: dict[str, dict[str, Any]] = {}
    for opts in pre_order_visit(schema, collector):
        # Include only the knobs that were explicitly configured.
        column_options: dict[str, Any] = {}
        if opts.ndv is not None:
            column_options["ndv"] = opts.ndv
        if opts.fpp is not None:
            column_options["fpp"] = opts.fpp
        result[opts.parquet_path] = column_options
    return result
2593+
2594+
24762595@dataclass (frozen = True )
24772596class DataFileStatistics :
24782597 record_count : int
@@ -2668,7 +2787,6 @@ def data_file_statistics_from_parquet_metadata(
26682787def write_file (io : FileIO , table_metadata : TableMetadata , tasks : Iterator [WriteTask ]) -> Iterator [DataFile ]:
26692788 from pyiceberg .table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE , TableProperties
26702789
2671- parquet_writer_kwargs = _get_parquet_writer_kwargs (table_metadata .properties )
26722790 row_group_size = property_as_int (
26732791 properties = table_metadata .properties ,
26742792 property_name = TableProperties .PARQUET_ROW_GROUP_LIMIT ,
@@ -2685,6 +2803,8 @@ def write_parquet(task: WriteTask) -> DataFile:
26852803 else :
26862804 file_schema = table_schema
26872805
2806+ parquet_writer_kwargs = _get_parquet_writer_kwargs (table_metadata .properties , file_schema )
2807+
26882808 downcast_ns_timestamp_to_us = Config ().get_bool (DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE ) or False
26892809 batches = [
26902810 _to_requested_schema (
@@ -2829,14 +2949,25 @@ def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_pa
28292949PYARROW_UNCOMPRESSED_CODEC = "none"
28302950
28312951
2832- def _get_parquet_writer_kwargs (table_properties : Properties ) -> dict [str , Any ]:
2952+ def _get_parquet_writer_kwargs (table_properties : Properties , file_schema : Schema ) -> dict [str , Any ]:
28332953 from pyiceberg .table import TableProperties
28342954
2835- for key_pattern in [
2955+ unsupported_key_patterns = [
28362956 TableProperties .PARQUET_ROW_GROUP_SIZE_BYTES ,
28372957 TableProperties .PARQUET_BLOOM_FILTER_MAX_BYTES ,
2838- f"{ TableProperties .PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX } .*" ,
2839- ]:
2958+ ]
2959+
2960+ from packaging import version
2961+
2962+ MIN_PYARROW_VERSION_SUPPORTING_BLOOM_FILTER_WRITES = "24.0.0"
2963+ if version .parse (pyarrow .__version__ ) < version .parse (MIN_PYARROW_VERSION_SUPPORTING_BLOOM_FILTER_WRITES ):
2964+ unsupported_key_patterns += [
2965+ f"{ TableProperties .PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX } .*" ,
2966+ f"{ TableProperties .PARQUET_BLOOM_FILTER_COLUMN_FPP_PREFIX } .*" ,
2967+ f"{ TableProperties .PARQUET_BLOOM_FILTER_COLUMN_NDV_PREFIX } .*" ,
2968+ ]
2969+
2970+ for key_pattern in unsupported_key_patterns :
28402971 if unsupported_keys := fnmatch .filter (table_properties , key_pattern ):
28412972 warnings .warn (f"Parquet writer option(s) { unsupported_keys } not implemented" , stacklevel = 2 )
28422973
@@ -2849,6 +2980,8 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]:
28492980 if compression_codec == ICEBERG_UNCOMPRESSED_CODEC :
28502981 compression_codec = PYARROW_UNCOMPRESSED_CODEC
28512982
2983+ bloom_filter_options = get_bloom_filter_options (file_schema , table_properties )
2984+
28522985 return {
28532986 "compression" : compression_codec ,
28542987 "compression_level" : compression_level ,
@@ -2867,6 +3000,7 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]:
28673000 property_name = TableProperties .PARQUET_PAGE_ROW_LIMIT ,
28683001 default = TableProperties .PARQUET_PAGE_ROW_LIMIT_DEFAULT ,
28693002 ),
3003+ ** ({"bloom_filter_options" : bloom_filter_options } if bloom_filter_options else {}),
28703004 }
28713005
28723006
0 commit comments