Commit 7ee20ea

fix comments of pr
1 parent fb7228a commit 7ee20ea

5 files changed

Lines changed: 154 additions & 35 deletions

mkdocs/docs/api.md

Lines changed: 61 additions & 0 deletions
@@ -971,6 +971,67 @@ readable_metrics: [
 
 To show only data files or delete files in the current snapshot, use `table.inspect.data_files()` and `table.inspect.delete_files()` respectively.
 
+### Position deletes
+
+Inspect the positional delete files in the current snapshot of the table:
+
+```python
+table.inspect.position_deletes()
+```
+
+```python
+pyarrow.Table
+file_path: string not null
+pos: int64 not null
+row: struct<id: int32, data: large_string>
+  child 0, id: int32
+  child 1, data: large_string
+partition: struct<data: large_string> not null
+  child 0, data: large_string
+spec_id: int64
+delete_file_path: string not null
+----
+file_path: [[],[],[],["s3://warehouse/default/table_metadata_position_deletes/data/data=a/00000-1-acbf93b7-f760-4517-aa84-b9240902d3d2-0-00001.parquet"]]
+pos: [[],[],[],[0]]
+row: [
+  -- is_valid: all not null
+  -- child 0 type: int32
+[]
+  -- child 1 type: large_string
+[],
+  -- is_valid: all not null
+  -- child 0 type: int32
+[]
+  -- child 1 type: large_string
+[],
+  -- is_valid: all not null
+  -- child 0 type: int32
+[]
+  -- child 1 type: large_string
+[],
+  -- is_valid: [false]
+  -- child 0 type: int32
+[0]
+  -- child 1 type: large_string
+[""]]
+partition: [
+  -- is_valid: all not null
+  -- child 0 type: large_string
+[],
+  -- is_valid: all not null
+  -- child 0 type: large_string
+[],
+  -- is_valid: all not null
+  -- child 0 type: large_string
+[],
+  -- is_valid: all not null
+  -- child 0 type: large_string
+["a"]]
+spec_id: [[],[],[],[0]]
+delete_file_path: [[],[],[],["s3://warehouse/default/table_metadata_position_deletes/data/data=a/00000-5-bc7a1d8a-fefe-4277-b4ac-8f1dd7badb7a-00001-deletes.parquet"]]
+
+```
+
 ## Add Files
 
 Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.
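
The inspect result above is an ordinary `pyarrow.Table`, so standard PyArrow conversions apply directly. A minimal usage sketch, assuming a catalog named `default` and the example table from the sample output (both names are illustrative):

```python
from pyiceberg.catalog import load_catalog

# Illustrative catalog/table names, matching the sample output above.
catalog = load_catalog("default")
table = catalog.load_table("default.table_metadata_position_deletes")

deletes = table.inspect.position_deletes()

# Plain pyarrow.Table: convert rows to dicts and inspect them.
for rec in deletes.to_pylist():
    print(rec["file_path"], rec["pos"], rec["delete_file_path"])
```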

pyiceberg/io/pyarrow.py

Lines changed: 11 additions & 3 deletions
@@ -122,6 +122,7 @@
     DataFile,
     DataFileContent,
     FileFormat,
+    PositionDelete,
 )
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec, partition_record_value
 from pyiceberg.schema import (
@@ -889,10 +890,17 @@ def _construct_fragment(fs: FileSystem, data_file: DataFile, file_format_kwargs:
     return _get_file_format(data_file.file_format, **file_format_kwargs).make_fragment(path, fs)
 
 
-def _read_delete_file(fs: FileSystem, data_file: DataFile, schema: "pa.Schema") -> pa.Table:
+def _read_delete_file(fs: FileSystem, data_file: DataFile) -> Iterator[PositionDelete]:
     delete_fragment = _construct_fragment(fs, data_file, file_format_kwargs={"pre_buffer": True, "buffer_size": ONE_MEGABYTE})
-    table = ds.Scanner.from_fragment(fragment=delete_fragment, schema=schema).to_table()
-    return table
+    table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
+    for batch in table.to_batches():
+        for i in range(len(batch)):
+            row = batch.column("row")[i].as_py() if "row" in batch.schema.names else None
+            yield PositionDelete(
+                file_path=batch.column("file_path")[i].as_py(),
+                pos=batch.column("pos")[i].as_py(),
+                row=row,  # optional: None when the delete file carries no row column
+            )
 
 
 def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
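
Review note: `_read_delete_file` now reads the Arrow table once, then walks its record batches and yields one `PositionDelete` per row, so callers iterate records instead of receiving a `pa.Table`. A self-contained sketch of that iteration pattern, using a hypothetical stand-in for pyiceberg's record type:

```python
from dataclasses import dataclass
from typing import Iterator, Optional

import pyarrow as pa


@dataclass
class PositionDeleteSketch:  # hypothetical stand-in for pyiceberg.manifest.PositionDelete
    file_path: str
    pos: int
    row: Optional[dict]


def rows_as_records(table: pa.Table) -> Iterator[PositionDeleteSketch]:
    # Walk record batches, then index each column per row, mirroring the
    # new _read_delete_file; "row" is an optional column in delete files.
    for batch in table.to_batches():
        for i in range(len(batch)):
            row = batch.column("row")[i].as_py() if "row" in batch.schema.names else None
            yield PositionDeleteSketch(
                file_path=batch.column("file_path")[i].as_py(),
                pos=batch.column("pos")[i].as_py(),
                row=row,
            )


deletes = pa.table({"file_path": ["s3://bucket/data.parquet"], "pos": [0]})
print(list(rows_as_records(deletes)))
```

The laziness only covers the Python-object conversion; `to_table()` still materializes the whole delete file in Arrow memory first.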

pyiceberg/manifest.py

Lines changed: 28 additions & 0 deletions
@@ -320,6 +320,34 @@ def data_file_with_partition(partition_type: StructType, format_version: TableVe
     )
 
 
+class PositionDelete(Record):
+    __slots__ = ("file_path", "pos", "row")
+    file_path: str
+    pos: int
+    row: Optional[Record]
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Assign a key/value to a PositionDelete."""
+        super().__setattr__(name, value)
+
+    def __init__(self, file_path: str, pos: int, row: Optional[Record], *data: Any, **named_data: Any) -> None:
+        super().__init__(*data, **named_data)
+        self.file_path = file_path
+        self.pos = pos
+        self.row = row
+
+    def __hash__(self) -> int:
+        """Return the hash of the file path."""
+        return hash(self.file_path)
+
+    def __eq__(self, other: Any) -> bool:
+        """Compare the PositionDelete with another object.
+
+        If it is a PositionDelete, it will compare based on the file_path.
+        """
+        return self.file_path == other.file_path if isinstance(other, PositionDelete) else False
+
+
 class DataFile(Record):
     __slots__ = (
         "content",

pyiceberg/table/inspect.py

Lines changed: 23 additions & 32 deletions
@@ -384,30 +384,15 @@ def _get_all_manifests_schema(self) -> "pa.Schema":
         all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))
         return all_manifests_schema
 
-    def _get_positional_file_schema(self) -> "pa.Schema":
-        import pyarrow as pa
-
-        from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-        pa_row_struct = schema_to_pyarrow(self.tbl.schema().as_struct())
-        positinal_delete_schema = pa.schema(
-            [
-                pa.field("file_path", pa.string(), nullable=False),
-                pa.field("pos", pa.int64(), nullable=False),
-                pa.field("row", pa_row_struct, nullable=True),
-            ]
-        )
-        return positinal_delete_schema
-
     def _get_positional_deletes_schema(self) -> "pa.Schema":
         import pyarrow as pa
 
         from pyiceberg.io.pyarrow import schema_to_pyarrow
 
-        partition_record = self.tbl.metadata.specs_struct()
-        pa_partition_struct = schema_to_pyarrow(partition_record)
+        partition_struct = self.tbl.metadata.spec_struct()
+        pa_partition_struct = schema_to_pyarrow(partition_struct)
         pa_row_struct = schema_to_pyarrow(self.tbl.schema().as_struct())
-        positinal_delete_schema = pa.schema(
+        positional_delete_schema = pa.schema(
             [
                 pa.field("file_path", pa.string(), nullable=False),
                 pa.field("pos", pa.int64(), nullable=False),
@@ -417,7 +402,7 @@ def _get_positional_deletes_schema(self) -> "pa.Schema":
                 pa.field("delete_file_path", pa.string(), nullable=False),
             ]
         )
-        return positinal_delete_schema
+        return positional_delete_schema
 
     def _generate_manifests_table(self, snapshot: Optional[Snapshot], is_all_manifests_table: bool = False) -> "pa.Table":
         import pyarrow as pa
@@ -492,22 +477,28 @@ def _generate_positional_delete_table(self, manifest: ManifestFile, position_del
         import pyarrow as pa
 
         positional_deletes: List["pa.Table"] = []
+
         if manifest.content == ManifestContent.DELETES:
             for entry in manifest.fetch_manifest_entry(self.tbl.io):
                 if entry.data_file.content == DataFileContent.POSITION_DELETES:
                     from pyiceberg.io.pyarrow import _fs_from_file_path, _read_delete_file
 
                     positional_delete_file = _read_delete_file(
-                        _fs_from_file_path(self.tbl.io, entry.data_file.file_path),
-                        entry.data_file,
-                        self._get_positional_file_schema(),
-                    ).to_pylist()
+                        _fs_from_file_path(self.tbl.io, entry.data_file.file_path), entry.data_file
+                    )
+                    positional_deletes_records = []
                     for record in positional_delete_file:
-                        record["partition"] = entry.data_file.partition.__dict__
-                        record["spec_id"] = manifest.partition_spec_id
-                        record["delete_file_path"] = entry.data_file.file_path
-
-                    positional_deletes.append(pa.Table.from_pylist(positional_delete_file, position_deletes_schema))
+                        row = {
+                            "file_path": record.file_path,
+                            "pos": record.pos,
+                            "row": record.row,
+                            "partition": entry.data_file.partition.__dict__,
+                            "spec_id": manifest.partition_spec_id,
+                            "delete_file_path": entry.data_file.file_path,
+                        }
+                        positional_deletes_records.append(row)
+
+                    positional_deletes.append(pa.Table.from_pylist(positional_deletes_records, position_deletes_schema))
 
         if not positional_deletes:
             return pa.Table.from_pylist([], position_deletes_schema)
@@ -718,18 +709,18 @@ def all_manifests(self) -> "pa.Table":
         )
         return pa.concat_tables(manifests_by_snapshots)
 
-    def position_deletes(self) -> "pa.Table":
+    def position_deletes(self, snapshot_id: Optional[int] = None) -> "pa.Table":
         import pyarrow as pa
 
+        snapshot = self._get_snapshot(snapshot_id) if snapshot_id else self.tbl.current_snapshot()
         position_deletes_schema = self._get_positional_deletes_schema()
-        current_snapshot = self.tbl.current_snapshot()
 
-        if not current_snapshot:
+        if not snapshot:
             return pa.Table.from_pylist([], schema=position_deletes_schema)
 
         executor = ExecutorFactory.get_or_create()
         positional_deletes: Iterator["pa.Table"] = executor.map(
             lambda manifest: self._generate_positional_delete_table(manifest, position_deletes_schema),
-            current_snapshot.manifests(self.tbl.io),
+            snapshot.manifests(self.tbl.io),
         )
         return pa.concat_tables(positional_deletes)
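
With the new `snapshot_id` parameter, position deletes can be inspected for an older snapshot rather than only the current one. A hedged usage sketch, reusing the `table` from the earlier example and pyiceberg's standard `history()` snapshot log:

```python
# Pick the oldest snapshot from the table's snapshot log.
previous_snapshot_id = table.history()[0].snapshot_id

older = table.inspect.position_deletes(snapshot_id=previous_snapshot_id)
current = table.inspect.position_deletes()  # defaults to the current snapshot

print(older.num_rows, current.num_rows)
```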

pyiceberg/table/metadata.py

Lines changed: 31 additions & 0 deletions
@@ -279,6 +279,37 @@ def specs_struct(self) -> StructType:
 
         return StructType(*nested_fields)
 
+    def spec_struct(self, spec_id: Optional[int] = None) -> StructType:
+        """Produce the partition struct for a single spec_id, or for the latest spec if none is given.
+
+        The partition fields should be optional: partition fields may be added later,
+        in which case not all files would have the result field, and it may be null.
+
+        :return: A StructType that represents a PartitionSpec of the table for a specific spec_id or latest.
+        """
+        if spec_id is None:
+            spec = self.spec()
+        else:
+            specs = self.specs()
+            filtered_spec = list(filter(lambda spec: spec.spec_id == spec_id, specs.values()))
+            if not filtered_spec:
+                raise ValidationError(f"Spec with spec_id {spec_id} not found")
+            spec = filtered_spec[0]
+        # Collect all the fields
+        struct_fields = {field.field_id: field for field in spec.fields}
+
+        schema = self.schema()
+
+        nested_fields = []
+        # Sort them by field_id in order to get a deterministic output
+        for field_id in sorted(struct_fields):
+            field = struct_fields[field_id]
+            source_type = schema.find_type(field.source_id)
+            result_type = field.transform.result_type(source_type)
+            nested_fields.append(NestedField(field_id=field.field_id, name=field.name, type=result_type, required=False))
+
+        return StructType(*nested_fields)
+
     def new_snapshot_id(self) -> int:
         """Generate a new snapshot-id that's not in use."""
         snapshot_id = _generate_snapshot_id()
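
For contrast with the existing `specs_struct` (which combines fields across all specs), the new `spec_struct` resolves a single spec: the latest by default, or an explicit `spec_id`. A hedged sketch of both paths (the printed shape is illustrative):

```python
from pyiceberg.exceptions import ValidationError

# Latest spec by default; partition fields come back as optional.
print(table.metadata.spec_struct())

# An unknown spec_id raises ValidationError rather than returning empty.
try:
    table.metadata.spec_struct(spec_id=42)
except ValidationError as err:
    print(err)  # "Spec with spec_id 42 not found"
```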
