Skip to content

Commit 0e72ccc

Browse files
committed
Reverted changes back to prior commit version for _get_all_datafiles
1 parent 8c906d2 commit 0e72ccc

2 files changed

Lines changed: 21 additions & 9 deletions

File tree

pyiceberg/table/maintenance.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -294,9 +294,27 @@ def _get_protected_snapshot_ids(self, table_metadata: TableMetadata) -> Set[int]
294294

295295
def _get_all_datafiles(self) -> List[DataFile]:
296296
"""Collect all DataFiles in the current snapshot only."""
297-
data_file_structs = self.tbl.inspect.data_files()
298-
data_files = [DataFile(df) for df in data_file_structs]
299-
return data_files
297+
datafiles: List[DataFile] = []
298+
299+
current_snapshot = self.tbl.current_snapshot()
300+
if not current_snapshot:
301+
return datafiles
302+
303+
def process_manifest(manifest: ManifestFile) -> list[DataFile]:
304+
found: list[DataFile] = []
305+
for entry in manifest.fetch_manifest_entry(io=self.tbl.io, discard_deleted=True):
306+
if hasattr(entry, "data_file"):
307+
found.append(entry.data_file)
308+
return found
309+
310+
# Scan only the current snapshot's manifests
311+
manifests = current_snapshot.manifests(io=self.tbl.io)
312+
with ThreadPoolExecutor() as executor:
313+
results = executor.map(process_manifest, manifests)
314+
for res in results:
315+
datafiles.extend(res)
316+
317+
return datafiles
300318

301319
def deduplicate_data_files(self) -> List[DataFile]:
302320
"""

tests/table/test_dedup_data_file_filepaths.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,6 @@ def test_deduplicate_data_files_removes_duplicates_in_current_snapshot(
124124
) -> None:
125125
mt = MaintenanceTable(tbl=prepopulated_table)
126126

127-
print("=== Before deduplication ===")
128-
check_data_files(prepopulated_table)
129-
130127
all_datafiles: List[DataFile] = mt._get_all_datafiles()
131128
file_names: List[str] = [os.path.basename(df.file_path) for df in all_datafiles]
132129
# There should be more than one reference before deduplication
@@ -135,9 +132,6 @@ def test_deduplicate_data_files_removes_duplicates_in_current_snapshot(
135132
)
136133
removed: List[DataFile] = mt.deduplicate_data_files()
137134

138-
print("=== After deduplication ===")
139-
check_data_files(prepopulated_table)
140-
141135
all_datafiles_after: List[DataFile] = mt._get_all_datafiles()
142136
file_names_after: List[str] = [os.path.basename(df.file_path) for df in all_datafiles_after]
143137
# Only one reference should remain after deduplication

0 commit comments

Comments
 (0)