|
22 | 22 | from pyiceberg.manifest import ManifestContent, ManifestFile |
23 | 23 | from pyiceberg.table import Table |
24 | 24 | from pyiceberg.table.snapshots import Operation, Snapshot |
25 | | -from pyiceberg.table.update.validate import validation_history |
| 25 | +from pyiceberg.table.update.validate import deleted_data_files, validation_history |
26 | 26 |
|
27 | 27 |
|
28 | 28 | def test_validation_history(table_v2_with_extensive_snapshots: Table) -> None: |
@@ -65,3 +65,47 @@ def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestF |
65 | 65 | ) |
66 | 66 |
|
67 | 67 | assert len(manifests) == expected_manifest_data_counts |
| 68 | + |
| 69 | + |
| 70 | +def test_deleted_data_files( |
| 71 | + table_v2_with_extensive_snapshots: Table, |
| 72 | +) -> None: |
| 73 | + mock_manifests = {} |
| 74 | + |
| 75 | + for i, snapshot in enumerate(table_v2_with_extensive_snapshots.snapshots()): |
| 76 | + mock_manifest = ManifestFile( |
| 77 | + manifest_path=f"foo/bar/{i}", |
| 78 | + manifest_length=1, |
| 79 | + partition_spec_id=1, |
| 80 | + content=ManifestContent.DATA if i % 2 == 0 else ManifestContent.DELETES, |
| 81 | + sequence_number=1, |
| 82 | + min_sequence_number=1, |
| 83 | + added_snapshot_id=snapshot.snapshot_id, |
| 84 | + ) |
| 85 | + |
| 86 | + # Store the manifest for this specific snapshot |
| 87 | + mock_manifests[snapshot.snapshot_id] = [mock_manifest] |
| 88 | + |
| 89 | + oldest_snapshot = table_v2_with_extensive_snapshots.snapshots()[0] |
| 90 | + newest_snapshot = cast(Snapshot, table_v2_with_extensive_snapshots.current_snapshot()) |
| 91 | + |
| 92 | + def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestFile]: |
| 93 | + """Mock the manifests method to use the snapshot_id for lookup.""" |
| 94 | + snapshot_id = self.snapshot_id |
| 95 | + if snapshot_id in mock_manifests: |
| 96 | + return mock_manifests[snapshot_id] |
| 97 | + return [] |
| 98 | + |
| 99 | + # every snapshot is an append, so we should get nothing! |
| 100 | + with patch("pyiceberg.table.snapshots.Snapshot.manifests", new=mock_read_manifest_side_effect): |
| 101 | + result = list( |
| 102 | + deleted_data_files( |
| 103 | + table=table_v2_with_extensive_snapshots, |
| 104 | + starting_snapshot=newest_snapshot, |
| 105 | + data_filter=None, |
| 106 | + parent_snapshot=oldest_snapshot, |
| 107 | + partition_set=None, |
| 108 | + ) |
| 109 | + ) |
| 110 | + |
| 111 | + assert result == [] |
0 commit comments