Fix schemas and partition specs to follow the snapshot rather than the latest table metadata, and make…

amitgilad3 · amitgilad3 · commit d2c58b2b1142 · 2025-02-18T21:36:33.000+02:00
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
@@ -904,14 +904,15 @@ def _read_delete_file(fs: FileSystem, data_file: DataFile) -> Iterator[PositionD
 
 
 def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
-    delete_fragment = _construct_fragment(
-        fs, data_file, file_format_kwargs={"dictionary_columns": ("file_path",), "pre_buffer": True, "buffer_size": ONE_MEGABYTE}
-    )
-    table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
-    table = table.unify_dictionaries()
+    deletes_by_file: Dict[str, List[int]] = {}
+    for delete in _read_delete_file(fs, data_file):
+        if delete.file_path not in deletes_by_file:
+            deletes_by_file[delete.file_path] = []
+        deletes_by_file[delete.file_path].append(delete.pos)
+
+    # Convert lists of positions to ChunkedArrays
     return {
-        file.as_py(): table.filter(pc.field("file_path") == file).column("pos")
-        for file in table.column("file_path").chunks[0].dictionary
+        file_path: pa.chunked_array([pa.array(positions, type=pa.int64())]) for file_path, positions in deletes_by_file.items()
     }
 
 
diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py
@@ -22,6 +22,7 @@
 from pyiceberg.conversions import from_bytes
 from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary
 from pyiceberg.partitioning import PartitionSpec
+from pyiceberg.schema import Schema
 from pyiceberg.table.snapshots import Snapshot, ancestors_of
 from pyiceberg.types import PrimitiveType
 from pyiceberg.utils.concurrent import ExecutorFactory
@@ -384,14 +385,16 @@ def _get_all_manifests_schema(self) -> "pa.Schema":
         all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))
         return all_manifests_schema
 
-    def _get_positional_deletes_schema(self) -> "pa.Schema":
+    def _get_positional_deletes_schema(self, schema: Optional[Schema] = None, spec_id: Optional[int] = None) -> "pa.Schema":
         import pyarrow as pa
 
         from pyiceberg.io.pyarrow import schema_to_pyarrow
 
-        partition_struct = self.tbl.metadata.spec_struct()
+        schema = schema or self.tbl.metadata.schema()
+
+        partition_struct = self.tbl.metadata.spec_struct(spec_id=spec_id)
         pa_partition_struct = schema_to_pyarrow(partition_struct)
-        pa_row_struct = schema_to_pyarrow(self.tbl.schema().as_struct())
+        pa_row_struct = schema_to_pyarrow(schema.as_struct())
         positional_delete_schema = pa.schema(
             [
                 pa.field("file_path", pa.string(), nullable=False),
@@ -473,11 +476,13 @@ def _partition_summaries_to_rows(
             schema=self._get_all_manifests_schema() if is_all_manifests_table else self._get_manifests_schema(),
         )
 
-    def _generate_positional_delete_table(self, manifest: ManifestFile, position_deletes_schema: "pa.Schema") -> "pa.Table":
+    def _generate_positional_delete_table(self, manifest: ManifestFile, schema: Schema) -> "pa.Table":
         import pyarrow as pa
 
         positional_deletes: List["pa.Table"] = []
 
+        position_deletes_schema = self._get_positional_deletes_schema(schema=schema, spec_id=manifest.partition_spec_id)
+
         if manifest.content == ManifestContent.DELETES:
             for entry in manifest.fetch_manifest_entry(self.tbl.io):
                 if entry.data_file.content == DataFileContent.POSITION_DELETES:
@@ -713,14 +718,14 @@ def position_deletes(self, snapshot_id: Optional[int] = None) -> "pa.Table":
         import pyarrow as pa
 
         snapshot = self._get_snapshot(snapshot_id) if snapshot_id else self.tbl.current_snapshot()
-        position_deletes_schema = self._get_positional_deletes_schema()
-
         if not snapshot:
-            return pa.Table.from_pylist([], schema=position_deletes_schema)
+            schema = self._get_positional_deletes_schema()
+            return pa.Table.from_pylist([], schema=schema)
 
+        schemas = self.tbl.schemas()
         executor = ExecutorFactory.get_or_create()
         positional_deletes: Iterator["pa.Table"] = executor.map(
-            lambda manifest: self._generate_positional_delete_table(manifest, position_deletes_schema),
+            lambda manifest: self._generate_positional_delete_table(manifest, schema=schemas[snapshot.schema_id]),
             snapshot.manifests(self.tbl.io),
         )
         return pa.concat_tables(positional_deletes)
diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py
@@ -280,7 +280,7 @@ def specs_struct(self) -> StructType:
         return StructType(*nested_fields)
 
     def spec_struct(self, spec_id: Optional[int] = None) -> StructType:
-        """Produce for a spec_id a struct of  PartitionSpecs.
+        """Produce for a spec_id a struct of PartitionSpecs.
 
         The partition fields should be optional: Partition fields may be added later,
         in which case not all files would have the result field, and it may be null.