Merge branch 'main' into feat/orphan-files

jayceslesar · jayceslesar · commit c414df812bca · 2025-06-10T17:50:44.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -35,6 +35,7 @@ coverage.xml
 .project
 .settings
 bin/
+.vscode/
 
 # Hive/metastore files
 metastore_db/
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyiceberg/table/update/validate.py b/pyiceberg/table/update/validate.py
@@ -27,7 +27,7 @@
 VALIDATE_DATA_FILES_EXIST_OPERATIONS = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE}
 
 
-def validation_history(
+def _validation_history(
     table: Table,
     from_snapshot: Snapshot,
     to_snapshot: Snapshot,
@@ -100,7 +100,7 @@ def _deleted_data_files(
     if parent_snapshot is None:
         return
 
-    manifests, snapshot_ids = validation_history(
+    manifests, snapshot_ids = _validation_history(
         table,
         parent_snapshot,
         starting_snapshot,
diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py
@@ -111,6 +111,36 @@ def _transform_literal(func: Callable[[L], L], lit: Literal[L]) -> Literal[L]:
     return literal(func(lit.value))
 
 
+def _pyiceberg_transform_wrapper(
+    transform_func: Callable[["ArrayLike", Any], "ArrayLike"],
+    *args: Any,
+    expected_type: Optional["pa.DataType"] = None,
+) -> Callable[["ArrayLike"], "ArrayLike"]:
+    try:
+        import pyarrow as pa
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError("For partition transforms, PyArrow needs to be installed") from e
+
+    def _transform(array: "ArrayLike") -> "ArrayLike":
+        def _cast_if_needed(arr: "ArrayLike") -> "ArrayLike":
+            if expected_type is not None:
+                return arr.cast(expected_type)
+            else:
+                return arr
+
+        if isinstance(array, pa.Array):
+            return _cast_if_needed(transform_func(array, *args))
+        elif isinstance(array, pa.ChunkedArray):
+            result_chunks = []
+            for arr in array.iterchunks():
+                result_chunks.append(_cast_if_needed(transform_func(arr, *args)))
+            return pa.chunked_array(result_chunks)
+        else:
+            raise ValueError(f"PyArrow array can only be of type pa.Array or pa.ChunkedArray, but found {type(array)}")
+
+    return _transform
+
+
 class Transform(IcebergRootModel[str], ABC, Generic[S, T]):
     """Transform base class for concrete transforms.
 
@@ -175,27 +205,6 @@ def supports_pyarrow_transform(self) -> bool:
     @abstractmethod
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ...
 
-    def _pyiceberg_transform_wrapper(
-        self, transform_func: Callable[["ArrayLike", Any], "ArrayLike"], *args: Any
-    ) -> Callable[["ArrayLike"], "ArrayLike"]:
-        try:
-            import pyarrow as pa
-        except ModuleNotFoundError as e:
-            raise ModuleNotFoundError("For bucket/truncate transforms, PyArrow needs to be installed") from e
-
-        def _transform(array: "ArrayLike") -> "ArrayLike":
-            if isinstance(array, pa.Array):
-                return transform_func(array, *args)
-            elif isinstance(array, pa.ChunkedArray):
-                result_chunks = []
-                for arr in array.iterchunks():
-                    result_chunks.append(transform_func(arr, *args))
-                return pa.chunked_array(result_chunks)
-            else:
-                raise ValueError(f"PyArrow array can only be of type pa.Array or pa.ChunkedArray, but found {type(array)}")
-
-        return _transform
-
 
 def parse_transform(v: Any) -> Transform[Any, Any]:
     if isinstance(v, str):
@@ -375,7 +384,7 @@ def __repr__(self) -> str:
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
         from pyiceberg_core import transform as pyiceberg_core_transform
 
-        return self._pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets)
+        return _pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets)
 
     @property
     def supports_pyarrow_transform(self) -> bool:
@@ -501,22 +510,9 @@ def __repr__(self) -> str:
 
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
         import pyarrow as pa
-        import pyarrow.compute as pc
-
-        if isinstance(source, DateType):
-            epoch = pa.scalar(datetime.EPOCH_DATE)
-        elif isinstance(source, TimestampType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
-        elif isinstance(source, TimestamptzType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
-        elif isinstance(source, TimestampNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
-        elif isinstance(source, TimestamptzNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
-        else:
-            raise ValueError(f"Cannot apply year transform for type: {source}")
+        from pyiceberg_core import transform as pyiceberg_core_transform
 
-        return lambda v: pc.years_between(epoch, v) if v is not None else None
+        return _pyiceberg_transform_wrapper(pyiceberg_core_transform.year, expected_type=pa.int32())
 
 
 class MonthTransform(TimeTransform[S]):
@@ -575,28 +571,9 @@ def __repr__(self) -> str:
 
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
         import pyarrow as pa
-        import pyarrow.compute as pc
-
-        if isinstance(source, DateType):
-            epoch = pa.scalar(datetime.EPOCH_DATE)
-        elif isinstance(source, TimestampType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
-        elif isinstance(source, TimestamptzType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
-        elif isinstance(source, TimestampNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
-        elif isinstance(source, TimestamptzNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
-        else:
-            raise ValueError(f"Cannot apply month transform for type: {source}")
-
-        def month_func(v: pa.Array) -> pa.Array:
-            return pc.add(
-                pc.multiply(pc.years_between(epoch, v), pa.scalar(12)),
-                pc.add(pc.month(v), pa.scalar(-1)),
-            )
+        from pyiceberg_core import transform as pyiceberg_core_transform
 
-        return lambda v: month_func(v) if v is not None else None
+        return _pyiceberg_transform_wrapper(pyiceberg_core_transform.month, expected_type=pa.int32())
 
 
 class DayTransform(TimeTransform[S]):
@@ -663,22 +640,9 @@ def __repr__(self) -> str:
 
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
         import pyarrow as pa
-        import pyarrow.compute as pc
-
-        if isinstance(source, DateType):
-            epoch = pa.scalar(datetime.EPOCH_DATE)
-        elif isinstance(source, TimestampType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
-        elif isinstance(source, TimestamptzType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
-        elif isinstance(source, TimestampNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
-        elif isinstance(source, TimestamptzNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
-        else:
-            raise ValueError(f"Cannot apply day transform for type: {source}")
+        from pyiceberg_core import transform as pyiceberg_core_transform
 
-        return lambda v: pc.days_between(epoch, v) if v is not None else None
+        return _pyiceberg_transform_wrapper(pyiceberg_core_transform.day, expected_type=pa.int32())
 
 
 class HourTransform(TimeTransform[S]):
@@ -728,21 +692,9 @@ def __repr__(self) -> str:
         return "HourTransform()"
 
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
-        import pyarrow as pa
-        import pyarrow.compute as pc
-
-        if isinstance(source, TimestampType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
-        elif isinstance(source, TimestamptzType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
-        elif isinstance(source, TimestampNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
-        elif isinstance(source, TimestamptzNanoType):
-            epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
-        else:
-            raise ValueError(f"Cannot apply hour transform for type: {source}")
+        from pyiceberg_core import transform as pyiceberg_core_transform
 
-        return lambda v: pc.hours_between(epoch, v) if v is not None else None
+        return _pyiceberg_transform_wrapper(pyiceberg_core_transform.hour)
 
 
 def _base64encode(buffer: bytes) -> str:
@@ -965,7 +917,7 @@ def __repr__(self) -> str:
     def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
         from pyiceberg_core import transform as pyiceberg_core_transform
 
-        return self._pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width)
+        return _pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width)
 
     @property
     def supports_pyarrow_transform(self) -> bool:
diff --git a/pyproject.toml b/pyproject.toml
@@ -81,7 +81,7 @@ psycopg2-binary = { version = ">=2.9.6", optional = true }
 sqlalchemy = { version = "^2.0.18", optional = true }
 getdaft = { version = ">=0.2.12", optional = true }
 cachetools = ">=5.5,<7.0"
-pyiceberg-core = { version = "^0.4.0", optional = true }
+pyiceberg-core = { version = "^0.5.1", optional = true }
 polars = { version = "^1.21.0", optional = true }
 thrift-sasl = { version = ">=0.4.3", optional = true }
 kerberos = {version = "^1.3.1", optional = true}
@@ -97,10 +97,10 @@ requests-mock = "1.12.1"
 moto = { version = "^5.0.2", extras = ["server"] }
 typing-extensions = "4.13.2"
 pytest-mock = "3.14.1"
-pyspark = "3.5.5"
-cython = "3.1.1"
+pyspark = "3.5.6"
+cython = "3.1.2"
 deptry = ">=0.14,<0.24"
-datafusion = ">=44,<47"
+datafusion = ">=44,<48"
 docutils = "!=0.21.post1"   # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520
 
 [tool.poetry.group.docs.dependencies]
@@ -109,7 +109,7 @@ mkdocs = "1.6.1"
 griffe = "1.7.3"
 jinja2 = "3.1.6"
 mkdocstrings = "0.29.1"
-mkdocstrings-python = "1.16.11"
+mkdocstrings-python = "1.16.12"
 mkdocs-literate-nav = "0.6.2"
 mkdocs-autorefs = "1.4.2"
 mkdocs-gen-files = "0.5.0"
@@ -289,7 +289,7 @@ generate-setup-file = false
 script = "build-module.py"
 
 [tool.poetry.extras]
-pyarrow = ["pyarrow"]
+pyarrow = ["pyarrow", "pyiceberg-core"]
 pandas = ["pandas", "pyarrow"]
 duckdb = ["duckdb", "pyarrow"]
 ray = ["ray", "pyarrow", "pandas"]
diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py
@@ -186,8 +186,8 @@ def test_partition_type(table_schema_simple: Schema) -> None:
         (DecimalType(5, 9), Decimal(19.25)),
         (DateType(), datetime.date(1925, 5, 22)),
         (TimeType(), datetime.time(19, 25, 00)),
-        (TimestampType(), datetime.datetime(19, 5, 1, 22, 1, 1)),
-        (TimestamptzType(), datetime.datetime(19, 5, 1, 22, 1, 1, tzinfo=datetime.timezone.utc)),
+        (TimestampType(), datetime.datetime(2022, 5, 1, 22, 1, 1)),
+        (TimestamptzType(), datetime.datetime(2022, 5, 1, 22, 1, 1, tzinfo=datetime.timezone.utc)),
         (StringType(), "abc"),
         (UUIDType(), UUID("12345678-1234-5678-1234-567812345678").bytes),
         (FixedType(5), 'b"\x8e\xd1\x87\x01"'),
@@ -208,13 +208,7 @@ def test_transform_consistency_with_pyarrow_transform(source_type: PrimitiveType
     ]
     for t in all_transforms:
         if t.can_transform(source_type):
-            try:
-                assert t.transform(source_type)(value) == t.pyarrow_transform(source_type)(pa.array([value])).to_pylist()[0]
-            except ValueError as e:
-                # Skipping unsupported feature
-                if "FeatureUnsupported => Unsupported data type for truncate transform" in str(e):
-                    continue
-                raise
+            assert t.transform(source_type)(value) == t.pyarrow_transform(source_type)(pa.array([value])).to_pylist()[0]
 
 
 def test_deserialize_partition_field_v2() -> None:
diff --git a/tests/table/test_validate.py b/tests/table/test_validate.py
@@ -25,7 +25,7 @@
 from pyiceberg.manifest import ManifestContent, ManifestEntry, ManifestEntryStatus, ManifestFile
 from pyiceberg.table import Table
 from pyiceberg.table.snapshots import Operation, Snapshot, Summary
-from pyiceberg.table.update.validate import _deleted_data_files, _validate_deleted_data_files, validation_history
+from pyiceberg.table.update.validate import _deleted_data_files, _validate_deleted_data_files, _validation_history
 
 
 @pytest.fixture
@@ -69,7 +69,7 @@ def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestF
         return []
 
     with patch("pyiceberg.table.snapshots.Snapshot.manifests", new=mock_read_manifest_side_effect):
-        manifests, snapshots = validation_history(
+        manifests, snapshots = _validation_history(
             table,
             oldest_snapshot,
             newest_snapshot,
@@ -99,7 +99,7 @@ def test_validation_history_fails_on_snapshot_with_no_summary(
     )
     with patch("pyiceberg.table.update.validate.ancestors_between", return_value=[snapshot_with_no_summary]):
         with pytest.raises(ValidationException):
-            validation_history(
+            _validation_history(
                 table,
                 oldest_snapshot,
                 newest_snapshot,
@@ -129,7 +129,7 @@ def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestF
     with patch("pyiceberg.table.snapshots.Snapshot.manifests", new=mock_read_manifest_side_effect):
         with patch("pyiceberg.table.update.validate.ancestors_between", return_value=missing_oldest_snapshot):
             with pytest.raises(ValidationException):
-                validation_history(
+                _validation_history(
                     table,
                     oldest_snapshot,
                     newest_snapshot,
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
@@ -16,15 +16,14 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=eval-used,protected-access,redefined-outer-name
-from datetime import date, datetime
+from datetime import date
 from decimal import Decimal
 from typing import Annotated, Any, Callable, Optional, Union
 from uuid import UUID
 
 import mmh3 as mmh3
 import pyarrow as pa
 import pytest
-import pytz
 from pydantic import (
     BeforeValidator,
     PlainSerializer,
@@ -1654,38 +1653,6 @@ def test_bucket_pyarrow_transforms(
     assert expected == transform.pyarrow_transform(source_type)(input_arr)
 
 
-# pyiceberg_core currently does not support bucket transform on timestamp_ns and timestamptz_ns
-# https://github.com/apache/iceberg-rust/issues/1110
-@pytest.mark.parametrize(
-    "source_type, input_arr, num_buckets",
-    [
-        (
-            TimestampNanoType(),
-            pa.array([datetime(1970, 1, 1, 0, 0, 0), datetime(2025, 2, 26, 1, 2, 3)], type=pa.timestamp(unit="ns")),
-            10,
-        ),
-        (
-            TimestamptzNanoType(),
-            pa.array(
-                [datetime(1970, 1, 1, 0, 0, 0), datetime(2025, 2, 26, 1, 2, 3)],
-                type=pa.timestamp(unit="ns", tz=pytz.timezone("Etc/GMT+10")),
-            ),
-            10,
-        ),
-    ],
-)
-def test_unsupported_bucket_pyarrow_transform(
-    source_type: PrimitiveType,
-    input_arr: Union[pa.Array, pa.ChunkedArray],
-    num_buckets: int,
-) -> None:
-    transform: Transform[Any, Any] = BucketTransform(num_buckets=num_buckets)
-    with pytest.raises(ValueError) as exc_info:
-        transform.pyarrow_transform(source_type)(input_arr)
-
-    assert "FeatureUnsupported => Unsupported data type for bucket transform" in str(exc_info.value)
-
-
 @pytest.mark.parametrize(
     "source_type, input_arr, expected, width",
     [