Skip to content

Commit 5c8dc67

Browse files
committed
Add table.maintenance.compact() for full-table data file compaction
This introduces a simplified, whole-table compaction strategy via the MaintenanceTable API (`table.maintenance.compact()`). Key implementation details: - Reads the entire table state into memory via `.to_arrow()`. - Uses `table.overwrite()` to rewrite data, leveraging PyIceberg's target file bin-packing (`write.target-file-size-bytes`) natively. - Ensures atomicity by executing within a table transaction. - Explicitly sets `snapshot-type: replace` and `replace-operation: compaction` to ensure correct metadata history for downstream engines. - Includes a guard to safely ignore compaction requests on empty tables. Includes full Pytest coverage in `tests/table/test_maintenance.py`. Closes #1092
1 parent 4173ef7 commit 5c8dc67

2 files changed

Lines changed: 124 additions & 0 deletions

File tree

pyiceberg/table/maintenance.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,26 @@ def expire_snapshots(self) -> ExpireSnapshots:
4343
from pyiceberg.table.update.snapshot import ExpireSnapshots
4444

4545
return ExpireSnapshots(transaction=Transaction(self.tbl, autocommit=True))
46+
47+
def compact(self) -> None:
    """Compact the table's data files by reading and overwriting the entire table.

    Note: This is a full-table compaction that leverages Arrow for binpacking.
    It currently reads the entire table into memory via `.to_arrow()`.

    This reads all existing data into memory and writes it back out using the
    target file size settings (write.target-file-size-bytes), atomically
    dropping the old files and replacing them with fewer, larger files.
    """
    # Materialize the table's current contents as a single Arrow table.
    current_data = self.tbl.scan().to_arrow()

    # Guard clause: an overwrite with zero rows would delete every data file,
    # so an empty table is a no-op rather than a destructive rewrite.
    if current_data.num_rows == 0:
        logger.info("Table contains no rows, skipping compaction.")
        return

    # Mark the snapshot so downstream engines can recognize this REPLACE
    # commit as a compaction rather than a user-driven overwrite.
    compaction_summary = {
        "snapshot-type": "replace",
        "replace-operation": "compaction",
    }

    # The transaction makes the drop-old / write-new swap atomic.
    with self.tbl.transaction() as txn:
        txn.overwrite(current_data, snapshot_properties=compaction_summary)

tests/table/test_maintenance.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
import random
18+
import pyarrow as pa
19+
import pytest
20+
21+
from pyiceberg.catalog import Catalog
22+
from pyiceberg.schema import Schema
23+
from pyiceberg.partitioning import PartitionSpec, PartitionField
24+
from pyiceberg.transforms import IdentityTransform
25+
from pyiceberg.exceptions import NoSuchNamespaceError
26+
27+
28+
def test_maintenance_compact(catalog: Catalog) -> None:
    """End-to-end check of table.maintenance.compact() on a partitioned table.

    Appends 12 small files across 3 partitions, compacts, and verifies the
    row count is preserved, the file count drops to one per partition, and
    the new snapshot carries the compaction summary properties.
    """
    from pyiceberg.exceptions import NamespaceAlreadyExistsError
    from pyiceberg.types import NestedField, StringType, LongType

    # Setup Schema and specs
    schema = Schema(
        NestedField(1, "id", LongType()),
        NestedField(2, "category", StringType()),
        NestedField(3, "value", LongType()),
    )
    spec = PartitionSpec(
        PartitionField(source_id=2, field_id=1000, transform=IdentityTransform(), name="category")
    )

    # Create the namespace and table.
    # BUG FIX: create_namespace raises NamespaceAlreadyExistsError (not
    # NoSuchNamespaceError) when the namespace already exists; the old
    # guard never matched and the test crashed on a warm catalog.
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        pass
    table = catalog.create_table(
        "default.test_compaction",
        schema=schema,
        partition_spec=spec,
    )

    # Append many small data files (one manifest/data file per append)
    categories = ["cat1", "cat2", "cat3"]
    for i in range(12):
        table.append(pa.table({
            "id": list(range(i * 10, (i + 1) * 10)),
            "category": [categories[i % 3]] * 10,
            "value": [random.randint(1, 100) for _ in range(10)],
        }))

    # Verify state before compaction
    before_files = list(table.scan().plan_files())
    assert len(before_files) == 12
    assert table.scan().to_arrow().num_rows == 120

    # Execute Compaction
    table.maintenance.compact()

    # Verify state after compaction
    table.refresh()
    after_files = list(table.scan().plan_files())
    assert len(after_files) == 3  # Should be 1 optimized data file per partition
    assert table.scan().to_arrow().num_rows == 120

    # Ensure snapshot properties specify the replace-operation
    new_snapshot = table.current_snapshot()
    assert new_snapshot is not None
    assert new_snapshot.summary.get("snapshot-type") == "replace"
    assert new_snapshot.summary.get("replace-operation") == "compaction"
def test_maintenance_compact_empty_table(catalog: Catalog) -> None:
    """Verify compact() is a safe no-op on a table with no data.

    An empty table must not gain a new snapshot from compaction (an
    overwrite with zero rows would otherwise delete all data files).
    """
    from pyiceberg.exceptions import NamespaceAlreadyExistsError
    from pyiceberg.types import NestedField, StringType, LongType

    schema = Schema(
        NestedField(1, "id", LongType()),
        NestedField(2, "category", StringType()),
    )

    # BUG FIX: create_namespace raises NamespaceAlreadyExistsError (not
    # NoSuchNamespaceError) when the namespace already exists; the old
    # guard never matched and the test crashed on a warm catalog.
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        pass

    table = catalog.create_table("default.test_compaction_empty", schema=schema)
    before_snapshots = len(table.history())

    # Should safely return doing nothing
    table.maintenance.compact()

    table.refresh()
    after_snapshots = len(table.history())
    assert before_snapshots == after_snapshots  # No new snapshot should be made

0 commit comments

Comments
 (0)