1717from __future__ import annotations
1818
1919from datetime import datetime , timezone
20- from typing import TYPE_CHECKING , Any , Dict , Iterator , List , Optional , Set , Tuple
20+ from functools import reduce
21+ from typing import TYPE_CHECKING , Any , Dict , Iterator , List , Optional , Set , Tuple , Union , cast
2122
2223from pyiceberg .conversions import from_bytes
24+ from pyiceberg .io import _parse_location
2325from pyiceberg .manifest import DataFile , DataFileContent , ManifestContent , PartitionFieldSummary
2426from pyiceberg .partitioning import PartitionSpec
2527from pyiceberg .table .snapshots import Snapshot , ancestors_of
@@ -645,10 +647,16 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
    """Return the file listing restricted to delete-file content types.

    Delegates to ``self._files`` with the position-delete and
    equality-delete content types as the filter.

    Args:
        snapshot_id: Snapshot to inspect; forwarded unchanged to ``self._files``.

    Returns:
        Whatever ``self._files`` returns for the delete content types.
    """
    delete_content_types = {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}
    return self._files(snapshot_id, delete_content_types)
647649
648- def all_manifests (self , snapshots : Optional [list [Snapshot ]] = None ) -> "pa.Table" :
650+ def all_manifests (self , snapshots : Optional [Union [ list [Snapshot ], list [ int ] ]] = None ) -> "pa.Table" :
649651 import pyarrow as pa
650652
651- snapshots = snapshots or self .tbl .snapshots ()
653+ # coerce into snapshot objects if the user passes in snapshot ids
654+ if snapshots is not None :
655+ if isinstance (snapshots [0 ], int ):
656+ snapshots = cast (list [Snapshot ], [self .tbl .metadata .snapshot_by_id (snapshot_id ) for snapshot_id in snapshots ])
657+ else :
658+ snapshots = self .tbl .snapshots ()
659+
652660 if not snapshots :
653661 return pa .Table .from_pylist ([], schema = self ._get_all_manifests_schema ())
654662
@@ -657,3 +665,36 @@ def all_manifests(self, snapshots: Optional[list[Snapshot]] = None) -> "pa.Table
657665 lambda args : self ._generate_manifests_table (* args ), [(snapshot , True ) for snapshot in snapshots ]
658666 )
659667 return pa .concat_tables (manifests_by_snapshots )
668+
def orphaned_files(self, location: str) -> Set[str]:
    """Return the files under ``location`` that the table's snapshots do not reference.

    The set of known files is built from the manifest paths reported by
    ``all_manifests`` and the ``file_path`` column of ``files`` for every
    snapshot of the table. Everything of type ``File`` found by a recursive
    listing of ``location`` that is not in that set is returned.

    Args:
        location: Root location to list recursively.

    Returns:
        Paths found under ``location`` that are absent from the known-file set.

    Raises:
        ModuleNotFoundError: If PyArrow is not installed.
    """
    try:
        import pyarrow as pa  # noqa: F401
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError("For deleting orphaned files PyArrow needs to be installed") from e

    from pyarrow.fs import FileSelector, FileType

    from pyiceberg.io.pyarrow import _fs_from_file_path

    snapshots = self.tbl.snapshots()
    all_known_files: Set[str] = set()

    # NOTE(review): only manifest files and the files listed by `files()` are
    # treated as known here; manifest lists and table metadata files are not
    # added, so they will be reported if they live under `location` -- confirm
    # this is intended before deleting anything based on the result.
    if snapshots:
        # Skip the lookups entirely for a table without snapshots: there is
        # nothing to collect, and an empty list must not reach `all_manifests`.
        all_known_files.update(self.all_manifests(snapshots)["path"].to_pylist())

        # The ids must be handed to `map` as its iterable; without one, `map`
        # produces no results and no data file is ever recorded as known.
        snapshot_ids = [snapshot.snapshot_id for snapshot in snapshots]
        executor = ExecutorFactory.get_or_create()
        files_by_snapshots: Iterator[Set[str]] = executor.map(
            lambda snapshot_id: set(self.files(snapshot_id)["file_path"].to_pylist()), snapshot_ids
        )
        for snapshot_files in files_by_snapshots:
            all_known_files.update(snapshot_files)

    fs = _fs_from_file_path(self.tbl.io, location)

    _, _, path = _parse_location(location)
    selector = FileSelector(path, recursive=True)
    # filter to just files as the listing may also return directories
    # NOTE(review): listed paths are compared verbatim with the paths stored in
    # table metadata -- verify both use the same form (scheme / bucket prefix).
    all_files = {f.path for f in fs.get_file_info(selector) if f.type == FileType.File}

    return all_files - all_known_files
0 commit comments