Skip to content

Commit 53c9107

Browse files
committed
Merge branch 'main' into maint/catalog-impl-roundtripping
2 parents c7d70ed + e9c0253 commit 53c9107

21 files changed

Lines changed: 1343 additions & 645 deletions

mkdocs/docs/api.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1523,6 +1523,52 @@ print(ray_dataset.take(2))
15231523
]
15241524
```
15251525

1526+
### Bodo
1527+
1528+
PyIceberg interfaces closely with Bodo DataFrames (see [Bodo Iceberg Quick Start](https://docs.bodo.ai/latest/quick_start/quickstart_local_iceberg/)),
1529+
which provides a drop-in replacement for Pandas that applies query, compiler and HPC optimizations automatically.
1530+
Bodo accelerates and scales Python code from single laptops to large clusters without code rewrites.
1531+
1532+
<!-- prettier-ignore-start -->
1533+
1534+
!!! note "Requirements"
1535+
This requires [`bodo` to be installed](index.md).
1536+
1537+
```python
1538+
pip install pyiceberg['bodo']
1539+
```
1540+
<!-- prettier-ignore-end -->
1541+
1542+
A table can be read easily into a Bodo DataFrame to perform Pandas operations:
1543+
1544+
```python
1545+
df = table.to_bodo() # equivalent to `bodo.pandas.read_iceberg_table(table)`
1546+
df = df[df["trip_distance"] >= 10.0]
1547+
df = df[["VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"]]
1548+
print(df)
1549+
```
1550+
1551+
This creates a lazy query, optimizes it, and runs it on all available cores (print triggers execution):
1552+
1553+
```python
1554+
VendorID tpep_pickup_datetime tpep_dropoff_datetime
1555+
0 2 2023-01-01 00:27:12 2023-01-01 00:49:56
1556+
1 2 2023-01-01 00:09:29 2023-01-01 00:29:23
1557+
2 1 2023-01-01 00:13:30 2023-01-01 00:44:00
1558+
3 2 2023-01-01 00:41:41 2023-01-01 01:19:32
1559+
4 2 2023-01-01 00:22:39 2023-01-01 01:30:45
1560+
... ... ... ...
1561+
245478 2 2023-01-31 22:32:57 2023-01-31 23:01:48
1562+
245479 2 2023-01-31 22:03:26 2023-01-31 22:46:13
1563+
245480 2 2023-01-31 23:25:56 2023-02-01 00:05:42
1564+
245481 2 2023-01-31 23:18:00 2023-01-31 23:46:00
1565+
245482 2 2023-01-31 23:18:00 2023-01-31 23:41:00
1566+
1567+
[245483 rows x 3 columns]
1568+
```
1569+
1570+
Bodo is optimized to take advantage of Iceberg features such as hidden partitioning and various statistics for efficient reads.
1571+
15261572
### Daft
15271573

15281574
PyIceberg interfaces closely with Daft Dataframes (see also: [Daft integration with Iceberg](https://docs.daft.ai/en/stable/io/iceberg/)) which provides a full lazily optimized query engine interface on top of PyIceberg tables.

mkdocs/docs/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ You can mix and match optional dependencies depending on your needs:
5252
| pandas | Installs both PyArrow and Pandas |
5353
| duckdb | Installs both PyArrow and DuckDB |
5454
| ray | Installs PyArrow, Pandas, and Ray |
55+
| bodo | Installs Bodo |
5556
| daft | Installs Daft |
5657
| polars | Installs Polars |
5758
| s3fs | S3FS as a FileIO implementation to interact with the object store |

poetry.lock

Lines changed: 820 additions & 602 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/io/pyarrow.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2728,9 +2728,11 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
27282728

27292729
for partition, name in zip(spec.fields, partition_fields):
27302730
source_field = schema.find_field(partition.source_id)
2731-
arrow_table = arrow_table.append_column(
2732-
name, partition.transform.pyarrow_transform(source_field.field_type)(arrow_table[source_field.name])
2733-
)
2731+
full_field_name = schema.find_column_name(partition.source_id)
2732+
if full_field_name is None:
2733+
raise ValueError(f"Could not find column name for field ID: {partition.source_id}")
2734+
field_array = _get_field_from_arrow_table(arrow_table, full_field_name)
2735+
arrow_table = arrow_table.append_column(name, partition.transform.pyarrow_transform(source_field.field_type)(field_array))
27342736

27352737
unique_partition_fields = arrow_table.select(partition_fields).group_by(partition_fields).aggregate([])
27362738

@@ -2765,3 +2767,32 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
27652767
)
27662768

27672769
return table_partitions
2770+
2771+
2772+
def _get_field_from_arrow_table(arrow_table: pa.Table, field_path: str) -> pa.Array:
    """Resolve ``field_path`` against ``arrow_table`` and return the matching array.

    Resolution order:
    1. Exact top-level column match, so column names that contain a literal
       dot (e.g. "some.id") are honored first.
    2. Dot-separated nested access (e.g. "bar.baz"): the first segment names
       a struct column and the remaining segments navigate into it.

    Args:
        arrow_table: The Arrow table containing the field.
        field_path: Field name or dot-separated path.

    Returns:
        The field as a PyArrow Array.

    Raises:
        KeyError: If the field path cannot be resolved.
    """
    # A column literally named e.g. "a.b" must win over nested interpretation.
    if field_path in arrow_table.column_names:
        return arrow_table[field_path]

    # No exact match: interpret the dots as struct navigation.
    head, *nested = field_path.split(".")
    return pc.struct_field(arrow_table[head], nested)

pyiceberg/table/__init__.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@
137137
from pyiceberg.utils.properties import property_as_bool
138138

139139
if TYPE_CHECKING:
140+
import bodo.pandas as bd
140141
import daft
141142
import pandas as pd
142143
import polars as pl
@@ -1485,6 +1486,16 @@ def to_daft(self) -> daft.DataFrame:
14851486

14861487
return daft.read_iceberg(self)
14871488

1489+
def to_bodo(self) -> bd.DataFrame:
    """Read a Bodo DataFrame lazily from this Iceberg table.

    Returns:
        bd.DataFrame: Unmaterialized Bodo DataFrame backed by this Iceberg table.
    """
    # Imported lazily so bodo stays an optional dependency.
    from bodo.pandas import read_iceberg_table

    return read_iceberg_table(self)
1498+
14881499
def to_polars(self) -> pl.LazyFrame:
14891500
"""Lazily read from this Apache Iceberg table.
14901501
@@ -1691,7 +1702,14 @@ def to_polars(self) -> pl.DataFrame: ...
16911702

16921703
def update(self: S, **overrides: Any) -> S:
    """Return a copy of this table scan with the given fields replaced.

    Only attributes that correspond to constructor parameters are carried
    over; extra attributes on ``self`` that the constructor does not accept
    are deliberately ignored.
    """
    from inspect import signature

    cls = type(self)
    # Mirror the constructor signature rather than self.__dict__, which may
    # contain attributes the constructor does not accept.
    init_params = [name for name in signature(cls.__init__).parameters if name != "self"]
    kwargs = {name: getattr(self, name) for name in init_params}  # Assume parameters are attributes
    kwargs.update(overrides)
    return cls(**kwargs)
16951713

16961714
def use_ref(self: S, name: str) -> S:
16971715
if self.snapshot_id:

pyiceberg/table/snapshots.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
TOTAL_FILE_SIZE = "total-files-size"
5959
CHANGED_PARTITION_COUNT_PROP = "changed-partition-count"
6060
CHANGED_PARTITION_PREFIX = "partitions."
61+
PARTITION_SUMMARY_PROP = "partition-summaries-included"
6162
OPERATION = "operation"
6263

6364
INITIAL_SEQUENCE_NUMBER = 0
@@ -306,6 +307,8 @@ def build(self) -> Dict[str, str]:
306307
changed_partitions_size = len(self.partition_metrics)
307308
set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP)
308309
if changed_partitions_size <= self.max_changed_partitions_for_summaries:
310+
if changed_partitions_size > 0:
311+
properties[PARTITION_SUMMARY_PROP] = "true"
309312
for partition_path, update_metrics_partition in self.partition_metrics.items():
310313
if (summary := self._partition_summary(update_metrics_partition)) and len(summary) != 0:
311314
properties[CHANGED_PARTITION_PREFIX + partition_path] = summary

pyiceberg/table/statistics.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17-
from typing import Dict, List, Literal, Optional
17+
from typing import Dict, List, Literal, Optional, Union
1818

1919
from pydantic import Field
2020

@@ -48,7 +48,7 @@ class PartitionStatisticsFile(StatisticsCommonFields):
4848

4949

5050
def filter_statistics_by_snapshot_id(
    statistics: List[Union[StatisticsFile, PartitionStatisticsFile]],
    reject_snapshot_id: int,
) -> List[Union[StatisticsFile, PartitionStatisticsFile]]:
    """Return ``statistics`` without any entry belonging to ``reject_snapshot_id``."""

    def keep(stat: Union[StatisticsFile, PartitionStatisticsFile]) -> bool:
        return stat.snapshot_id != reject_snapshot_id

    return list(filter(keep, statistics))

pyiceberg/table/update/__init__.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,11 @@
3636
SnapshotLogEntry,
3737
)
3838
from pyiceberg.table.sorting import SortOrder
39-
from pyiceberg.table.statistics import StatisticsFile, filter_statistics_by_snapshot_id
39+
from pyiceberg.table.statistics import (
40+
PartitionStatisticsFile,
41+
StatisticsFile,
42+
filter_statistics_by_snapshot_id,
43+
)
4044
from pyiceberg.typedef import (
4145
IcebergBaseModel,
4246
Properties,
@@ -198,6 +202,16 @@ class RemoveStatisticsUpdate(IcebergBaseModel):
198202
snapshot_id: int = Field(alias="snapshot-id")
199203

200204

205+
class SetPartitionStatisticsUpdate(IcebergBaseModel):
    """TableUpdate that attaches a partition statistics file to the table metadata."""

    action: Literal["set-partition-statistics"] = Field(default="set-partition-statistics")
    # Alias keeps the wire format kebab-case, consistent with the "snapshot-id"
    # alias on RemovePartitionStatisticsUpdate and the other table updates.
    partition_statistics: PartitionStatisticsFile = Field(alias="partition-statistics")
208+
209+
210+
class RemovePartitionStatisticsUpdate(IcebergBaseModel):
    """TableUpdate that removes the partition statistics file for one snapshot."""

    action: Literal["remove-partition-statistics"] = Field(default="remove-partition-statistics")
    # "snapshot-id" is the kebab-case wire name; snapshot_id is the Python attribute.
    snapshot_id: int = Field(alias="snapshot-id")
213+
214+
201215
TableUpdate = Annotated[
202216
Union[
203217
AssignUUIDUpdate,
@@ -217,6 +231,8 @@ class RemoveStatisticsUpdate(IcebergBaseModel):
217231
RemovePropertiesUpdate,
218232
SetStatisticsUpdate,
219233
RemoveStatisticsUpdate,
234+
SetPartitionStatisticsUpdate,
235+
RemovePartitionStatisticsUpdate,
220236
],
221237
Field(discriminator="action"),
222238
]
@@ -582,6 +598,29 @@ def _(update: RemoveStatisticsUpdate, base_metadata: TableMetadata, context: _Ta
582598
return base_metadata.model_copy(update={"statistics": statistics})
583599

584600

601+
@_apply_table_update.register(SetPartitionStatisticsUpdate)
def _(update: SetPartitionStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
    """Replace any partition-statistics entry for the update's snapshot with the new file."""
    new_file = update.partition_statistics
    # Drop any existing entry for the same snapshot, then append the new one.
    retained = filter_statistics_by_snapshot_id(base_metadata.partition_statistics, new_file.snapshot_id)
    context.add_update(update)
    return base_metadata.model_copy(update={"partition_statistics": [*retained, new_file]})
610+
611+
@_apply_table_update.register(RemovePartitionStatisticsUpdate)
def _(
    update: RemovePartitionStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext
) -> TableMetadata:
    """Remove the partition-statistics entry for the given snapshot, failing if absent."""
    known_snapshots = {part_stat.snapshot_id for part_stat in base_metadata.partition_statistics}
    if update.snapshot_id not in known_snapshots:
        raise ValueError(f"Partition Statistics with snapshot id {update.snapshot_id} does not exist")

    remaining = filter_statistics_by_snapshot_id(base_metadata.partition_statistics, update.snapshot_id)
    context.add_update(update)
    return base_metadata.model_copy(update={"partition_statistics": remaining})
622+
623+
585624
def update_table_metadata(
586625
base_metadata: TableMetadata,
587626
updates: Tuple[TableUpdate, ...],

pyiceberg/transforms.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717

1818
import base64
1919
import datetime as py_datetime
20+
import importlib
2021
import struct
22+
import types
2123
from abc import ABC, abstractmethod
2224
from enum import IntEnum
2325
from functools import singledispatch
@@ -28,6 +30,7 @@
2830
import mmh3
2931
from pydantic import Field, PositiveInt, PrivateAttr
3032

33+
from pyiceberg.exceptions import NotInstalledError
3134
from pyiceberg.expressions import (
3235
BoundEqualTo,
3336
BoundGreaterThan,
@@ -106,6 +109,17 @@
106109
TRUNCATE_PARSER = ParseNumberFromBrackets(TRUNCATE)
107110

108111

112+
def _try_import(module_name: str, extras_name: Optional[str] = None) -> types.ModuleType:
113+
try:
114+
return importlib.import_module(module_name)
115+
except ImportError:
116+
if extras_name:
117+
msg = f'{module_name} needs to be installed. pip install "pyiceberg[{extras_name}]"'
118+
else:
119+
msg = f"{module_name} needs to be installed."
120+
raise NotInstalledError(msg) from None
121+
122+
109123
def _transform_literal(func: Callable[[L], L], lit: Literal[L]) -> Literal[L]:
110124
"""Small helper to upwrap the value from the literal, and wrap it again."""
111125
return literal(func(lit.value))
@@ -382,8 +396,7 @@ def __repr__(self) -> str:
382396
return f"BucketTransform(num_buckets={self._num_buckets})"
383397

384398
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
    """Return a callable that applies the bucket transform to a PyArrow array.

    ``source`` is part of the shared ``pyarrow_transform`` interface; it is
    not used by this implementation.
    """
    # Lazy import with an actionable message if the optional dependency is missing.
    pyiceberg_core_transform = _try_import("pyiceberg_core", extras_name="pyiceberg-core").transform
    return _pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets)
388401

389402
@property
@@ -509,9 +522,8 @@ def __repr__(self) -> str:
509522
return "YearTransform()"
510523

511524
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
    """Return a callable that applies the year transform to a PyArrow array.

    ``source`` is part of the shared ``pyarrow_transform`` interface; it is
    not used by this implementation. The result is cast to ``pa.int32()``.
    """
    # Consistency fix: pass extras_name so a missing pyarrow reports the pip
    # extra to install, matching DayTransform.pyarrow_transform.
    pa = _try_import("pyarrow", extras_name="pyarrow")
    pyiceberg_core_transform = _try_import("pyiceberg_core", extras_name="pyiceberg-core").transform
    return _pyiceberg_transform_wrapper(pyiceberg_core_transform.year, expected_type=pa.int32())

517529

@@ -570,8 +582,8 @@ def __repr__(self) -> str:
570582
return "MonthTransform()"
571583

572584
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
    """Return a callable that applies the month transform to a PyArrow array.

    ``source`` is part of the shared ``pyarrow_transform`` interface; it is
    not used by this implementation. The result is cast to ``pa.int32()``.
    """
    # Consistency fix: pass extras_name so a missing pyarrow reports the pip
    # extra to install, matching DayTransform.pyarrow_transform.
    pa = _try_import("pyarrow", extras_name="pyarrow")
    pyiceberg_core_transform = _try_import("pyiceberg_core", extras_name="pyiceberg-core").transform

    return _pyiceberg_transform_wrapper(pyiceberg_core_transform.month, expected_type=pa.int32())
577589

@@ -639,8 +651,8 @@ def __repr__(self) -> str:
639651
return "DayTransform()"
640652

641653
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
    """Return a callable that applies the day transform to a PyArrow array.

    ``source`` is part of the shared ``pyarrow_transform`` interface; it is
    not used by this implementation. The result is cast to ``pa.int32()``.
    """
    # extras_name yields a pip-install hint when the optional dependency is absent.
    pa = _try_import("pyarrow", extras_name="pyarrow")
    pyiceberg_core_transform = _try_import("pyiceberg_core", extras_name="pyiceberg-core").transform

    return _pyiceberg_transform_wrapper(pyiceberg_core_transform.day, expected_type=pa.int32())
646658

@@ -692,7 +704,7 @@ def __repr__(self) -> str:
692704
return "HourTransform()"
693705

694706
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
    """Return a callable that applies the hour transform to a PyArrow array.

    ``source`` is part of the shared ``pyarrow_transform`` interface; it is
    not used by this implementation.
    """
    pyiceberg_core_transform = _try_import("pyiceberg_core", extras_name="pyiceberg-core").transform

    return _pyiceberg_transform_wrapper(pyiceberg_core_transform.hour)
698710

@@ -915,7 +927,7 @@ def __repr__(self) -> str:
915927
return f"TruncateTransform(width={self._width})"
916928

917929
def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]":
    """Return a callable that applies the truncate transform to a PyArrow array.

    ``source`` is part of the shared ``pyarrow_transform`` interface; it is
    not used by this implementation.
    """
    pyiceberg_core_transform = _try_import("pyiceberg_core", extras_name="pyiceberg-core").transform

    return _pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width)
921933

pyiceberg/utils/schema_conversion.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,10 @@
6969
# Maps (Avro logical type, Avro physical type) pairs to Iceberg primitive types.
LOGICAL_FIELD_TYPE_MAPPING: Dict[Tuple[str, str], PrimitiveType] = {
    ("date", "int"): DateType(),
    ("time-micros", "long"): TimeType(),
    # Per the Avro spec, timestamp-millis annotates a long. The ("timestamp-millis", "int")
    # entry is kept for backward compatibility with non-conforming writers.
    ("timestamp-millis", "int"): TimestampType(),
    ("timestamp-millis", "long"): TimestampType(),
    ("timestamp-micros", "long"): TimestampType(),
    ("uuid", "fixed"): UUIDType(),
    ("uuid", "string"): UUIDType(),
}
7577

7678
AvroType = Union[str, Any]

0 commit comments

Comments
 (0)