Skip to content

Commit c414df8

Browse files
committed
Merge branch 'main' into feat/orphan-files
2 parents 85b4ab3 + dea5f77 commit c414df8

8 files changed

Lines changed: 237 additions & 338 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ coverage.xml
3535
.project
3636
.settings
3737
bin/
38+
.vscode/
3839

3940
# Hive/metastore files
4041
metastore_db/

poetry.lock

Lines changed: 180 additions & 195 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/table/update/validate.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@
2727
VALIDATE_DATA_FILES_EXIST_OPERATIONS = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE}
2828

2929

30-
def validation_history(
30+
def _validation_history(
3131
table: Table,
3232
from_snapshot: Snapshot,
3333
to_snapshot: Snapshot,
@@ -100,7 +100,7 @@ def _deleted_data_files(
100100
if parent_snapshot is None:
101101
return
102102

103-
manifests, snapshot_ids = validation_history(
103+
manifests, snapshot_ids = _validation_history(
104104
table,
105105
parent_snapshot,
106106
starting_snapshot,

pyiceberg/transforms.py

Lines changed: 40 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -111,6 +111,36 @@ def _transform_literal(func: Callable[[L], L], lit: Literal[L]) -> Literal[L]:
111111
return literal(func(lit.value))
112112

113113

114+
def _pyiceberg_transform_wrapper(
115+
transform_func: Callable[["ArrayLike", Any], "ArrayLike"],
116+
*args: Any,
117+
expected_type: Optional["pa.DataType"] = None,
118+
) -> Callable[["ArrayLike"], "ArrayLike"]:
119+
try:
120+
import pyarrow as pa
121+
except ModuleNotFoundError as e:
122+
raise ModuleNotFoundError("For partition transforms, PyArrow needs to be installed") from e
123+
124+
def _transform(array: "ArrayLike") -> "ArrayLike":
125+
def _cast_if_needed(arr: "ArrayLike") -> "ArrayLike":
126+
if expected_type is not None:
127+
return arr.cast(expected_type)
128+
else:
129+
return arr
130+
131+
if isinstance(array, pa.Array):
132+
return _cast_if_needed(transform_func(array, *args))
133+
elif isinstance(array, pa.ChunkedArray):
134+
result_chunks = []
135+
for arr in array.iterchunks():
136+
result_chunks.append(_cast_if_needed(transform_func(arr, *args)))
137+
return pa.chunked_array(result_chunks)
138+
else:
139+
raise ValueError(f"PyArrow array can only be of type pa.Array or pa.ChunkedArray, but found {type(array)}")
140+
141+
return _transform
142+
143+
114144
class Transform(IcebergRootModel[str], ABC, Generic[S, T]):
115145
"""Transform base class for concrete transforms.
116146
@@ -175,27 +205,6 @@ def supports_pyarrow_transform(self) -> bool:
175205
@abstractmethod
176206
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ...
177207

178-
def _pyiceberg_transform_wrapper(
179-
self, transform_func: Callable[["ArrayLike", Any], "ArrayLike"], *args: Any
180-
) -> Callable[["ArrayLike"], "ArrayLike"]:
181-
try:
182-
import pyarrow as pa
183-
except ModuleNotFoundError as e:
184-
raise ModuleNotFoundError("For bucket/truncate transforms, PyArrow needs to be installed") from e
185-
186-
def _transform(array: "ArrayLike") -> "ArrayLike":
187-
if isinstance(array, pa.Array):
188-
return transform_func(array, *args)
189-
elif isinstance(array, pa.ChunkedArray):
190-
result_chunks = []
191-
for arr in array.iterchunks():
192-
result_chunks.append(transform_func(arr, *args))
193-
return pa.chunked_array(result_chunks)
194-
else:
195-
raise ValueError(f"PyArrow array can only be of type pa.Array or pa.ChunkedArray, but found {type(array)}")
196-
197-
return _transform
198-
199208

200209
def parse_transform(v: Any) -> Transform[Any, Any]:
201210
if isinstance(v, str):
@@ -375,7 +384,7 @@ def __repr__(self) -> str:
375384
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
376385
from pyiceberg_core import transform as pyiceberg_core_transform
377386

378-
return self._pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets)
387+
return _pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets)
379388

380389
@property
381390
def supports_pyarrow_transform(self) -> bool:
@@ -501,22 +510,9 @@ def __repr__(self) -> str:
501510

502511
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
503512
import pyarrow as pa
504-
import pyarrow.compute as pc
505-
506-
if isinstance(source, DateType):
507-
epoch = pa.scalar(datetime.EPOCH_DATE)
508-
elif isinstance(source, TimestampType):
509-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
510-
elif isinstance(source, TimestamptzType):
511-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
512-
elif isinstance(source, TimestampNanoType):
513-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
514-
elif isinstance(source, TimestamptzNanoType):
515-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
516-
else:
517-
raise ValueError(f"Cannot apply year transform for type: {source}")
513+
from pyiceberg_core import transform as pyiceberg_core_transform
518514

519-
return lambda v: pc.years_between(epoch, v) if v is not None else None
515+
return _pyiceberg_transform_wrapper(pyiceberg_core_transform.year, expected_type=pa.int32())
520516

521517

522518
class MonthTransform(TimeTransform[S]):
@@ -575,28 +571,9 @@ def __repr__(self) -> str:
575571

576572
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
577573
import pyarrow as pa
578-
import pyarrow.compute as pc
579-
580-
if isinstance(source, DateType):
581-
epoch = pa.scalar(datetime.EPOCH_DATE)
582-
elif isinstance(source, TimestampType):
583-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
584-
elif isinstance(source, TimestamptzType):
585-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
586-
elif isinstance(source, TimestampNanoType):
587-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
588-
elif isinstance(source, TimestamptzNanoType):
589-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
590-
else:
591-
raise ValueError(f"Cannot apply month transform for type: {source}")
592-
593-
def month_func(v: pa.Array) -> pa.Array:
594-
return pc.add(
595-
pc.multiply(pc.years_between(epoch, v), pa.scalar(12)),
596-
pc.add(pc.month(v), pa.scalar(-1)),
597-
)
574+
from pyiceberg_core import transform as pyiceberg_core_transform
598575

599-
return lambda v: month_func(v) if v is not None else None
576+
return _pyiceberg_transform_wrapper(pyiceberg_core_transform.month, expected_type=pa.int32())
600577

601578

602579
class DayTransform(TimeTransform[S]):
@@ -663,22 +640,9 @@ def __repr__(self) -> str:
663640

664641
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
665642
import pyarrow as pa
666-
import pyarrow.compute as pc
667-
668-
if isinstance(source, DateType):
669-
epoch = pa.scalar(datetime.EPOCH_DATE)
670-
elif isinstance(source, TimestampType):
671-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
672-
elif isinstance(source, TimestamptzType):
673-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
674-
elif isinstance(source, TimestampNanoType):
675-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
676-
elif isinstance(source, TimestamptzNanoType):
677-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
678-
else:
679-
raise ValueError(f"Cannot apply day transform for type: {source}")
643+
from pyiceberg_core import transform as pyiceberg_core_transform
680644

681-
return lambda v: pc.days_between(epoch, v) if v is not None else None
645+
return _pyiceberg_transform_wrapper(pyiceberg_core_transform.day, expected_type=pa.int32())
682646

683647

684648
class HourTransform(TimeTransform[S]):
@@ -728,21 +692,9 @@ def __repr__(self) -> str:
728692
return "HourTransform()"
729693

730694
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
731-
import pyarrow as pa
732-
import pyarrow.compute as pc
733-
734-
if isinstance(source, TimestampType):
735-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
736-
elif isinstance(source, TimestamptzType):
737-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
738-
elif isinstance(source, TimestampNanoType):
739-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
740-
elif isinstance(source, TimestamptzNanoType):
741-
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
742-
else:
743-
raise ValueError(f"Cannot apply hour transform for type: {source}")
695+
from pyiceberg_core import transform as pyiceberg_core_transform
744696

745-
return lambda v: pc.hours_between(epoch, v) if v is not None else None
697+
return _pyiceberg_transform_wrapper(pyiceberg_core_transform.hour)
746698

747699

748700
def _base64encode(buffer: bytes) -> str:
@@ -965,7 +917,7 @@ def __repr__(self) -> str:
965917
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
966918
from pyiceberg_core import transform as pyiceberg_core_transform
967919

968-
return self._pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width)
920+
return _pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width)
969921

970922
@property
971923
def supports_pyarrow_transform(self) -> bool:

pyproject.toml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@ psycopg2-binary = { version = ">=2.9.6", optional = true }
8181
sqlalchemy = { version = "^2.0.18", optional = true }
8282
getdaft = { version = ">=0.2.12", optional = true }
8383
cachetools = ">=5.5,<7.0"
84-
pyiceberg-core = { version = "^0.4.0", optional = true }
84+
pyiceberg-core = { version = "^0.5.1", optional = true }
8585
polars = { version = "^1.21.0", optional = true }
8686
thrift-sasl = { version = ">=0.4.3", optional = true }
8787
kerberos = {version = "^1.3.1", optional = true}
@@ -97,10 +97,10 @@ requests-mock = "1.12.1"
9797
moto = { version = "^5.0.2", extras = ["server"] }
9898
typing-extensions = "4.13.2"
9999
pytest-mock = "3.14.1"
100-
pyspark = "3.5.5"
101-
cython = "3.1.1"
100+
pyspark = "3.5.6"
101+
cython = "3.1.2"
102102
deptry = ">=0.14,<0.24"
103-
datafusion = ">=44,<47"
103+
datafusion = ">=44,<48"
104104
docutils = "!=0.21.post1" # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520
105105

106106
[tool.poetry.group.docs.dependencies]
@@ -109,7 +109,7 @@ mkdocs = "1.6.1"
109109
griffe = "1.7.3"
110110
jinja2 = "3.1.6"
111111
mkdocstrings = "0.29.1"
112-
mkdocstrings-python = "1.16.11"
112+
mkdocstrings-python = "1.16.12"
113113
mkdocs-literate-nav = "0.6.2"
114114
mkdocs-autorefs = "1.4.2"
115115
mkdocs-gen-files = "0.5.0"
@@ -289,7 +289,7 @@ generate-setup-file = false
289289
script = "build-module.py"
290290

291291
[tool.poetry.extras]
292-
pyarrow = ["pyarrow"]
292+
pyarrow = ["pyarrow", "pyiceberg-core"]
293293
pandas = ["pandas", "pyarrow"]
294294
duckdb = ["duckdb", "pyarrow"]
295295
ray = ["ray", "pyarrow", "pandas"]

tests/table/test_partitioning.py

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -186,8 +186,8 @@ def test_partition_type(table_schema_simple: Schema) -> None:
186186
(DecimalType(5, 9), Decimal(19.25)),
187187
(DateType(), datetime.date(1925, 5, 22)),
188188
(TimeType(), datetime.time(19, 25, 00)),
189-
(TimestampType(), datetime.datetime(19, 5, 1, 22, 1, 1)),
190-
(TimestamptzType(), datetime.datetime(19, 5, 1, 22, 1, 1, tzinfo=datetime.timezone.utc)),
189+
(TimestampType(), datetime.datetime(2022, 5, 1, 22, 1, 1)),
190+
(TimestamptzType(), datetime.datetime(2022, 5, 1, 22, 1, 1, tzinfo=datetime.timezone.utc)),
191191
(StringType(), "abc"),
192192
(UUIDType(), UUID("12345678-1234-5678-1234-567812345678").bytes),
193193
(FixedType(5), 'b"\x8e\xd1\x87\x01"'),
@@ -208,13 +208,7 @@ def test_transform_consistency_with_pyarrow_transform(source_type: PrimitiveType
208208
]
209209
for t in all_transforms:
210210
if t.can_transform(source_type):
211-
try:
212-
assert t.transform(source_type)(value) == t.pyarrow_transform(source_type)(pa.array([value])).to_pylist()[0]
213-
except ValueError as e:
214-
# Skipping unsupported feature
215-
if "FeatureUnsupported => Unsupported data type for truncate transform" in str(e):
216-
continue
217-
raise
211+
assert t.transform(source_type)(value) == t.pyarrow_transform(source_type)(pa.array([value])).to_pylist()[0]
218212

219213

220214
def test_deserialize_partition_field_v2() -> None:

tests/table/test_validate.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
from pyiceberg.manifest import ManifestContent, ManifestEntry, ManifestEntryStatus, ManifestFile
2626
from pyiceberg.table import Table
2727
from pyiceberg.table.snapshots import Operation, Snapshot, Summary
28-
from pyiceberg.table.update.validate import _deleted_data_files, _validate_deleted_data_files, validation_history
28+
from pyiceberg.table.update.validate import _deleted_data_files, _validate_deleted_data_files, _validation_history
2929

3030

3131
@pytest.fixture
@@ -69,7 +69,7 @@ def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestF
6969
return []
7070

7171
with patch("pyiceberg.table.snapshots.Snapshot.manifests", new=mock_read_manifest_side_effect):
72-
manifests, snapshots = validation_history(
72+
manifests, snapshots = _validation_history(
7373
table,
7474
oldest_snapshot,
7575
newest_snapshot,
@@ -99,7 +99,7 @@ def test_validation_history_fails_on_snapshot_with_no_summary(
9999
)
100100
with patch("pyiceberg.table.update.validate.ancestors_between", return_value=[snapshot_with_no_summary]):
101101
with pytest.raises(ValidationException):
102-
validation_history(
102+
_validation_history(
103103
table,
104104
oldest_snapshot,
105105
newest_snapshot,
@@ -129,7 +129,7 @@ def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestF
129129
with patch("pyiceberg.table.snapshots.Snapshot.manifests", new=mock_read_manifest_side_effect):
130130
with patch("pyiceberg.table.update.validate.ancestors_between", return_value=missing_oldest_snapshot):
131131
with pytest.raises(ValidationException):
132-
validation_history(
132+
_validation_history(
133133
table,
134134
oldest_snapshot,
135135
newest_snapshot,

tests/test_transforms.py

Lines changed: 1 addition & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,14 @@
1616
# specific language governing permissions and limitations
1717
# under the License.
1818
# pylint: disable=eval-used,protected-access,redefined-outer-name
19-
from datetime import date, datetime
19+
from datetime import date
2020
from decimal import Decimal
2121
from typing import Annotated, Any, Callable, Optional, Union
2222
from uuid import UUID
2323

2424
import mmh3 as mmh3
2525
import pyarrow as pa
2626
import pytest
27-
import pytz
2827
from pydantic import (
2928
BeforeValidator,
3029
PlainSerializer,
@@ -1654,38 +1653,6 @@ def test_bucket_pyarrow_transforms(
16541653
assert expected == transform.pyarrow_transform(source_type)(input_arr)
16551654

16561655

1657-
# pyiceberg_core currently does not support bucket transform on timestamp_ns and timestamptz_ns
1658-
# https://github.com/apache/iceberg-rust/issues/1110
1659-
@pytest.mark.parametrize(
1660-
"source_type, input_arr, num_buckets",
1661-
[
1662-
(
1663-
TimestampNanoType(),
1664-
pa.array([datetime(1970, 1, 1, 0, 0, 0), datetime(2025, 2, 26, 1, 2, 3)], type=pa.timestamp(unit="ns")),
1665-
10,
1666-
),
1667-
(
1668-
TimestamptzNanoType(),
1669-
pa.array(
1670-
[datetime(1970, 1, 1, 0, 0, 0), datetime(2025, 2, 26, 1, 2, 3)],
1671-
type=pa.timestamp(unit="ns", tz=pytz.timezone("Etc/GMT+10")),
1672-
),
1673-
10,
1674-
),
1675-
],
1676-
)
1677-
def test_unsupported_bucket_pyarrow_transform(
1678-
source_type: PrimitiveType,
1679-
input_arr: Union[pa.Array, pa.ChunkedArray],
1680-
num_buckets: int,
1681-
) -> None:
1682-
transform: Transform[Any, Any] = BucketTransform(num_buckets=num_buckets)
1683-
with pytest.raises(ValueError) as exc_info:
1684-
transform.pyarrow_transform(source_type)(input_arr)
1685-
1686-
assert "FeatureUnsupported => Unsupported data type for bucket transform" in str(exc_info.value)
1687-
1688-
16891656
@pytest.mark.parametrize(
16901657
"source_type, input_arr, expected, width",
16911658
[

0 commit comments

Comments
 (0)