5454
# Sentinel sequence number for manifests that have not been assigned one yet.
UNASSIGNED_SEQ = -1
DEFAULT_BLOCK_SIZE = 64 * 1024 * 1024  # 67108864 bytes
DEFAULT_READ_VERSION: Literal[3] = 3

INITIAL_SEQUENCE_NUMBER = 0
6060
@@ -852,6 +852,17 @@ def partitions(self) -> list[PartitionFieldSummary] | None:
852852 def key_metadata (self ) -> bytes | None :
853853 return self ._data [14 ]
854854
855+ @property
856+ def first_row_id (self ) -> int | None :
857+ return self ._data [15 ] if len (self ._data ) > 15 else None
858+
859+ @first_row_id .setter
860+ def first_row_id (self , value : int | None ) -> None :
861+ if len (self ._data ) <= 15 :
862+ self ._data .append (value )
863+ else :
864+ self ._data [15 ] = value
865+
def has_added_files(self) -> bool:
    """Tell whether the manifest may contain added files.

    A missing count (``None``) is treated as "may contain added files".
    """
    added = self.added_files_count
    if added is None:
        return True
    return added > 0
857868
@@ -1240,6 +1251,12 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
12401251 return entry
12411252
12421253
class ManifestWriterV3(ManifestWriterV2):
    """Manifest writer that inherits the V2 behavior and reports version 3."""

    @property
    def version(self) -> TableVersion:
        """Return the table format version this writer reports."""
        return 3
1258+
1259+
12431260def write_manifest (
12441261 format_version : TableVersion ,
12451262 spec : PartitionSpec ,
@@ -1252,6 +1269,8 @@ def write_manifest(
12521269 return ManifestWriterV1 (spec , schema , output_file , snapshot_id , avro_compression )
12531270 elif format_version == 2 :
12541271 return ManifestWriterV2 (spec , schema , output_file , snapshot_id , avro_compression )
1272+ elif format_version == 3 :
1273+ return ManifestWriterV3 (spec , schema , output_file , snapshot_id , avro_compression )
12551274 else :
12561275 raise ValueError (f"Cannot write manifest for table version: { format_version } " )
12571276
@@ -1295,6 +1314,10 @@ def __exit__(
@abstractmethod
def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:
    """Return the manifest file entry to write; implemented by subclasses."""
12971316
1317+ @property
1318+ def next_row_id (self ) -> int | None :
1319+ return None
1320+
def add_manifests(self, manifest_files: list[ManifestFile]) -> ManifestListWriter:
    """Prepare every manifest file and write them all as one block.

    Args:
        manifest_files: The manifest files to prepare and write.

    Returns:
        This writer, so calls can be chained.
    """
    prepared = [self.prepare_manifest(entry) for entry in manifest_files]
    self._writer.write_block(prepared)
    return self
@@ -1351,9 +1374,7 @@ def __init__(
13511374 self ._commit_snapshot_id = snapshot_id
13521375 self ._sequence_number = sequence_number
13531376
1354- def prepare_manifest (self , manifest_file : ManifestFile ) -> ManifestFile :
1355- wrapped_manifest_file = copy (manifest_file )
1356-
1377+ def _prepare_manifest_for_commit (self , wrapped_manifest_file : ManifestFile ) -> ManifestFile :
13571378 if wrapped_manifest_file .sequence_number == UNASSIGNED_SEQ :
13581379 # if the sequence number is being assigned here, then the manifest must be created by the current operation.
13591380 # To validate this, check that the snapshot id matches the current commit
@@ -1374,6 +1395,59 @@ def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:
13741395 wrapped_manifest_file .min_sequence_number = self ._sequence_number
13751396 return wrapped_manifest_file
13761397
def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:
    """Return a commit-ready copy of *manifest_file*.

    The input is shallow-copied first, so the commit preparation works on
    the copy rather than on the caller's object.
    """
    duplicate = copy(manifest_file)
    return self._prepare_manifest_for_commit(duplicate)
1400+
1401+
class ManifestListWriterV3(ManifestListWriterV2):
    """Manifest list writer for format version 3.

    Builds on the V2 writer: it records ``first-row-id`` and
    ``format-version`` "3" in the file metadata and hands out first row ids
    to data manifests that do not carry one yet.
    """

    # Row id that the next data manifest without a first row id will receive.
    _next_row_id: int

    def __init__(
        self,
        output_file: OutputFile,
        snapshot_id: int,
        parent_snapshot_id: int | None,
        sequence_number: int,
        snapshot_first_row_id: int,
        compression: AvroCompressionCodec,
    ):
        """Initialise the V2 base, then override version, metadata and row-id state.

        Args:
            output_file: Destination handed to the base writer.
            snapshot_id: Id of the snapshot being committed.
            parent_snapshot_id: Id of the parent snapshot, or ``None``.
            sequence_number: Sequence number of the commit.
            snapshot_first_row_id: Starting value for row id assignment.
            compression: Avro compression codec stored in the metadata.
        """
        super().__init__(
            output_file=output_file,
            snapshot_id=snapshot_id,
            parent_snapshot_id=parent_snapshot_id,
            sequence_number=sequence_number,
            compression=compression,
        )
        self._format_version = 3

        # A missing parent is written as the literal string "null".
        parent_id_text = "null" if parent_snapshot_id is None else str(parent_snapshot_id)
        self._meta = {
            "snapshot-id": str(snapshot_id),
            "parent-snapshot-id": parent_id_text,
            "sequence-number": str(sequence_number),
            "first-row-id": str(snapshot_first_row_id),
            "format-version": "3",
            AVRO_CODEC_KEY: compression,
        }
        self._next_row_id = snapshot_first_row_id

    @property
    def next_row_id(self) -> int | None:
        """Return the row id the next unassigned data manifest would receive."""
        return self._next_row_id

    def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile:
        """Return a commit-ready copy of *manifest_file* with a first row id.

        Only data manifests whose first row id is ``None`` get one assigned;
        every other manifest is returned as prepared by the shared commit
        logic.

        Raises:
            ValueError: If a row id must be assigned but the existing or
                added row count is ``None``.
        """
        prepared = self._prepare_manifest_for_commit(copy(manifest_file))

        is_data_manifest = prepared.content == ManifestContent.DATA
        if not is_data_manifest or prepared.first_row_id is not None:
            return prepared

        existing_rows = prepared.existing_rows_count
        added_rows = prepared.added_rows_count
        if existing_rows is None or added_rows is None:
            raise ValueError(
                "Cannot assign first row id for a v3 manifest without existing-rows-count and added-rows-count: "
                f"{prepared.manifest_path}"
            )

        # Hand out the current row id, then advance by the rows this manifest covers.
        prepared.first_row_id = self._next_row_id
        self._next_row_id += existing_rows + added_rows
        return prepared
1450+
13771451
13781452def write_manifest_list (
13791453 format_version : TableVersion ,
@@ -1382,12 +1456,26 @@ def write_manifest_list(
13821456 parent_snapshot_id : int | None ,
13831457 sequence_number : int | None ,
13841458 avro_compression : AvroCompressionCodec ,
1459+ snapshot_first_row_id : int | None = None ,
13851460) -> ManifestListWriter :
13861461 if format_version == 1 :
13871462 return ManifestListWriterV1 (output_file , snapshot_id , parent_snapshot_id , avro_compression )
13881463 elif format_version == 2 :
13891464 if sequence_number is None :
13901465 raise ValueError (f"Sequence-number is required for V2 tables: { sequence_number } " )
13911466 return ManifestListWriterV2 (output_file , snapshot_id , parent_snapshot_id , sequence_number , avro_compression )
1467+ elif format_version == 3 :
1468+ if sequence_number is None :
1469+ raise ValueError (f"Sequence-number is required for V3 tables: { sequence_number } " )
1470+ if snapshot_first_row_id is None :
1471+ raise ValueError (f"snapshot_first_row_id is required for V3 tables: { snapshot_first_row_id } " )
1472+ return ManifestListWriterV3 (
1473+ output_file = output_file ,
1474+ snapshot_id = snapshot_id ,
1475+ parent_snapshot_id = parent_snapshot_id ,
1476+ sequence_number = sequence_number ,
1477+ snapshot_first_row_id = snapshot_first_row_id ,
1478+ compression = avro_compression ,
1479+ )
13921480 else :
13931481 raise ValueError (f"Cannot write manifest list for table version: { format_version } " )
0 commit comments