Skip to content

Commit 0d65919

Browse files
committed
Fix DELETED manifest entry snapshot_id in _OverwriteFiles
When _OverwriteFiles._deleted_entries() creates DELETED manifest entries, it now sets snapshot_id to the current (deleting) snapshot's ID instead of retaining the original INSERT snapshot's ID. This aligns with the Iceberg spec which states that for DELETED entries, snapshot_id should be the snapshot in which the file was deleted. The existing _DeleteFiles._compute_deletes() already handled this correctly. Signed-off-by: Sotaro Hikita <bering1814@gmail.com>
1 parent b67b724 commit 0d65919

2 files changed

Lines changed: 56 additions & 1 deletion

File tree

pyiceberg/table/update/snapshot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
652652
return [
653653
ManifestEntry.from_args(
654654
status=ManifestEntryStatus.DELETED,
655-
snapshot_id=entry.snapshot_id,
655+
snapshot_id=self._snapshot_id,
656656
sequence_number=entry.sequence_number,
657657
file_sequence_number=entry.file_sequence_number,
658658
data_file=entry.data_file,

tests/integration/test_deletes.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -975,3 +975,58 @@ def assert_manifest_entry(expected_status: ManifestEntryStatus, expected_snapsho
975975
assert after_delete_snapshot is not None
976976

977977
assert_manifest_entry(ManifestEntryStatus.DELETED, after_delete_snapshot.snapshot_id)
978+
979+
980+
@pytest.mark.integration
981+
def test_manifest_entry_snapshot_id_after_partial_deletes(session_catalog: RestCatalog) -> None:
982+
"""Test that DELETED manifest entries from a CoW overwrite (partial delete) have the correct snapshot_id.
983+
984+
When only some rows match the delete filter, PyIceberg rewrites the file via _OverwriteFiles.
985+
The DELETED entry's snapshot_id must be the deleting snapshot's ID, not the original INSERT snapshot's ID.
986+
See: https://github.com/apache/iceberg-python/issues/3236
987+
"""
988+
identifier = "default.test_manifest_entry_snapshot_id_after_partial_deletes"
989+
try:
990+
session_catalog.drop_table(identifier)
991+
except NoSuchTableError:
992+
pass
993+
994+
schema = pa.schema(
995+
[
996+
("id", pa.int32()),
997+
("name", pa.string()),
998+
]
999+
)
1000+
1001+
table = session_catalog.create_table(identifier, schema)
1002+
data = pa.Table.from_pylist(
1003+
[
1004+
{"id": 1, "name": "keep"},
1005+
{"id": 2, "name": "keep"},
1006+
{"id": 3, "name": "delete_me"},
1007+
{"id": 4, "name": "delete_me"},
1008+
],
1009+
schema=schema,
1010+
)
1011+
table.append(data)
1012+
1013+
# Partial delete: only some rows match, triggering CoW overwrite via _OverwriteFiles
1014+
table.delete(EqualTo("name", "delete_me"))
1015+
after_delete_snapshot = table.refresh().current_snapshot()
1016+
assert after_delete_snapshot is not None
1017+
1018+
manifests = after_delete_snapshot.manifests(table.io)
1019+
deleted_entries = [
1020+
entry
1021+
for manifest in manifests
1022+
for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False)
1023+
if entry.status == ManifestEntryStatus.DELETED
1024+
]
1025+
1026+
assert len(deleted_entries) > 0, "Expected at least one DELETED manifest entry from the CoW overwrite"
1027+
1028+
for entry in deleted_entries:
1029+
assert entry.snapshot_id == after_delete_snapshot.snapshot_id, (
1030+
f"DELETED entry snapshot_id should be {after_delete_snapshot.snapshot_id} "
1031+
f"(the deleting snapshot), but was {entry.snapshot_id}"
1032+
)

0 commit comments

Comments
 (0)