3737from cachetools .keys import hashkey
3838from pydantic_core import to_json
3939
40+ from pyiceberg .avro .codecs import AVRO_CODEC_KEY , AvroCompressionCodec
4041from pyiceberg .avro .file import AvroFile , AvroOutputFile
4142from pyiceberg .conversions import to_bytes
4243from pyiceberg .exceptions import ValidationError
@@ -950,9 +951,16 @@ class ManifestWriter(ABC):
950951 _deleted_rows : int
951952 _min_sequence_number : Optional [int ]
952953 _partitions : List [Record ]
953- _reused_entry_wrapper : ManifestEntry
954+ _compression : AvroCompressionCodec
954955
955- def __init__ (self , spec : PartitionSpec , schema : Schema , output_file : OutputFile , snapshot_id : int ) -> None :
956+ def __init__ (
957+ self ,
958+ spec : PartitionSpec ,
959+ schema : Schema ,
960+ output_file : OutputFile ,
961+ snapshot_id : int ,
962+ avro_compression : AvroCompressionCodec ,
963+ ) -> None :
956964 self .closed = False
957965 self ._spec = spec
958966 self ._schema = schema
@@ -967,6 +975,7 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile,
967975 self ._deleted_rows = 0
968976 self ._min_sequence_number = None
969977 self ._partitions = []
978+ self ._compression = avro_compression
970979
971980 def __enter__ (self ) -> ManifestWriter :
972981 """Open the writer."""
@@ -1002,6 +1011,7 @@ def _meta(self) -> Dict[str, str]:
10021011 "partition-spec" : to_json (self ._spec .fields ).decode ("utf-8" ),
10031012 "partition-spec-id" : str (self ._spec .spec_id ),
10041013 "format-version" : str (self .version ),
1014+ AVRO_CODEC_KEY : self ._compression ,
10051015 }
10061016
10071017 def _with_partition (self , format_version : TableVersion ) -> Schema :
@@ -1113,13 +1123,15 @@ def existing(self, entry: ManifestEntry) -> ManifestWriter:
11131123
11141124
11151125class ManifestWriterV1 (ManifestWriter ):
def __init__(
    self,
    spec: PartitionSpec,
    schema: Schema,
    output_file: OutputFile,
    snapshot_id: int,
    avro_compression: AvroCompressionCodec,
) -> None:
    """Create a V1 manifest writer.

    Every argument is forwarded unchanged, and in the same order, to
    ``ManifestWriter.__init__``; this subclass adds no state of its own here.

    Args:
        spec: Partition spec handed to the base writer.
        schema: Table schema handed to the base writer.
        output_file: Destination file handed to the base writer.
        snapshot_id: Snapshot id handed to the base writer.
        avro_compression: Avro codec handed to the base writer.
    """
    super().__init__(spec, schema, output_file, snapshot_id, avro_compression)
11231135
def content(self) -> ManifestContent:
    """Return the content type of manifests produced by this writer.

    Returns:
        Always ``ManifestContent.DATA``.
    """
    return ManifestContent.DATA
@@ -1133,8 +1145,15 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
11331145
11341146
11351147class ManifestWriterV2 (ManifestWriter ):
def __init__(
    self,
    spec: PartitionSpec,
    schema: Schema,
    output_file: OutputFile,
    snapshot_id: int,
    avro_compression: AvroCompressionCodec,
) -> None:
    """Create a V2 manifest writer.

    Every argument is forwarded unchanged, and in the same order, to
    ``ManifestWriter.__init__``.

    Args:
        spec: Partition spec handed to the base writer.
        schema: Table schema handed to the base writer.
        output_file: Destination file handed to the base writer.
        snapshot_id: Snapshot id handed to the base writer.
        avro_compression: Avro codec handed to the base writer.
    """
    super().__init__(spec, schema, output_file, snapshot_id, avro_compression)
11381157
def content(self) -> ManifestContent:
    """Return the content type of manifests produced by this writer.

    Returns:
        Always ``ManifestContent.DATA``.
    """
    return ManifestContent.DATA
@@ -1160,12 +1179,17 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
11601179
11611180
def write_manifest(
    format_version: TableVersion,
    spec: PartitionSpec,
    schema: Schema,
    output_file: OutputFile,
    snapshot_id: int,
    avro_compression: AvroCompressionCodec,
) -> ManifestWriter:
    """Build the manifest writer that matches a table format version.

    Args:
        format_version: Table format version; only 1 and 2 are handled.
        spec: Partition spec passed through to the writer.
        schema: Table schema passed through to the writer.
        output_file: Destination file passed through to the writer.
        snapshot_id: Snapshot id passed through to the writer.
        avro_compression: Avro codec passed through to the writer.

    Returns:
        A ``ManifestWriterV1`` for version 1, a ``ManifestWriterV2`` for
        version 2.

    Raises:
        ValueError: If ``format_version`` is neither 1 nor 2.
    """
    if format_version == 1:
        return ManifestWriterV1(spec, schema, output_file, snapshot_id, avro_compression)
    elif format_version == 2:
        return ManifestWriterV2(spec, schema, output_file, snapshot_id, avro_compression)
    else:
        raise ValueError(f"Cannot write manifest for table version: {format_version}")
11711195
@@ -1215,14 +1239,21 @@ def add_manifests(self, manifest_files: List[ManifestFile]) -> ManifestListWrite
12151239
12161240
12171241class ManifestListWriterV1 (ManifestListWriter ):
def __init__(
    self,
    output_file: OutputFile,
    snapshot_id: int,
    parent_snapshot_id: Optional[int],
    compression: AvroCompressionCodec,
) -> None:
    """Create a V1 manifest-list writer.

    Delegates to the parent initializer with ``format_version=1`` and a
    metadata mapping built from the arguments.

    Args:
        output_file: Destination file passed to the parent initializer.
        snapshot_id: Stored as a string under ``"snapshot-id"``.
        parent_snapshot_id: Stored as a string under ``"parent-snapshot-id"``;
            ``None`` is recorded as the literal string ``"null"``.
        compression: Stored as-is under ``AVRO_CODEC_KEY``.
    """
    super().__init__(
        format_version=1,
        output_file=output_file,
        meta={
            "snapshot-id": str(snapshot_id),
            # A missing parent is written as the text "null", not omitted.
            "parent-snapshot-id": str(parent_snapshot_id) if parent_snapshot_id is not None else "null",
            "format-version": "1",
            AVRO_CODEC_KEY: compression,
        },
    )
12281259
@@ -1236,7 +1267,14 @@ class ManifestListWriterV2(ManifestListWriter):
12361267 _commit_snapshot_id : int
12371268 _sequence_number : int
12381269
1239- def __init__ (self , output_file : OutputFile , snapshot_id : int , parent_snapshot_id : Optional [int ], sequence_number : int ):
1270+ def __init__ (
1271+ self ,
1272+ output_file : OutputFile ,
1273+ snapshot_id : int ,
1274+ parent_snapshot_id : Optional [int ],
1275+ sequence_number : int ,
1276+ compression : AvroCompressionCodec ,
1277+ ):
12401278 super ().__init__ (
12411279 format_version = 2 ,
12421280 output_file = output_file ,
@@ -1245,6 +1283,7 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id
12451283 "parent-snapshot-id" : str (parent_snapshot_id ) if parent_snapshot_id is not None else "null" ,
12461284 "sequence-number" : str (sequence_number ),
12471285 "format-version" : "2" ,
1286+ AVRO_CODEC_KEY : compression ,
12481287 },
12491288 )
12501289 self ._commit_snapshot_id = snapshot_id
@@ -1279,12 +1318,13 @@ def write_manifest_list(
12791318 snapshot_id : int ,
12801319 parent_snapshot_id : Optional [int ],
12811320 sequence_number : Optional [int ],
1321+ avro_compression : AvroCompressionCodec ,
12821322) -> ManifestListWriter :
12831323 if format_version == 1 :
1284- return ManifestListWriterV1 (output_file , snapshot_id , parent_snapshot_id )
1324+ return ManifestListWriterV1 (output_file , snapshot_id , parent_snapshot_id , avro_compression )
12851325 elif format_version == 2 :
12861326 if sequence_number is None :
12871327 raise ValueError (f"Sequence-number is required for V2 tables: { sequence_number } " )
1288- return ManifestListWriterV2 (output_file , snapshot_id , parent_snapshot_id , sequence_number )
1328+ return ManifestListWriterV2 (output_file , snapshot_id , parent_snapshot_id , sequence_number , avro_compression )
12891329 else :
12901330 raise ValueError (f"Cannot write manifest list for table version: { format_version } " )
0 commit comments