Skip to content

Commit 06ceb12

Browse files
committed
Fix the snapshot summary of a partial overwrite
1 parent 5bbcb11 commit 06ceb12

4 files changed

Lines changed: 44 additions & 17 deletions

File tree

pyiceberg/table/snapshots.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
2929
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
3030
from pyiceberg.schema import Schema
31+
from pyiceberg.utils.deprecated import deprecation_message
3132

3233
if TYPE_CHECKING:
3334
from pyiceberg.table.metadata import TableMetadata
@@ -356,6 +357,11 @@ def update_snapshot_summaries(
356357
raise ValueError(f"Operation not implemented: {summary.operation}")
357358

358359
if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None:
360+
deprecation_message(
361+
deprecated_in="0.10.0",
362+
removed_in="0.11.0",
363+
help_message="The truncate-full-table should be used.",
364+
)
359365
summary = _truncate_table_summary(summary, previous_summary)
360366

361367
if not previous_summary:

pyiceberg/table/update/snapshot.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
236236
return update_snapshot_summaries(
237237
summary=Summary(operation=self._operation, **ssc.build(), **snapshot_properties),
238238
previous_summary=previous_snapshot.summary if previous_snapshot is not None else None,
239-
truncate_full_table=self._operation == Operation.OVERWRITE,
240239
)
241240

242241
def _commit(self) -> UpdatesAndRequirements:

tests/integration/test_writes/test_writes.py

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -271,15 +271,19 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal
271271
}
272272
pa_schema = pa.schema(
273273
[
274-
pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)),
275-
pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)),
274+
pa.field("id", pa.int32(), pa.int32()),
275+
pa.field("name", pa.int32(), pa.string()),
276276
]
277277
)
278278
arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
279279
tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, schema=pa_schema)
280280
with tbl.update_spec() as txn:
281-
txn.add_identity("id") # partition by `id` to create 3 data files
282-
tbl.append(arrow_table) # append
281+
txn.add_identity("id")
282+
tbl.append(arrow_table)
283+
284+
# TODO: We might want to check why this ends up in 3 files
285+
assert len(tbl.inspect.data_files()) == 3
286+
283287
tbl.delete(delete_filter="id == 1 and name = 'AB'") # partial overwrite data from 1 data file
284288

285289
rows = spark.sql(
@@ -311,24 +315,44 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal
311315
"total-position-deletes": "0",
312316
"total-records": "5",
313317
}
314-
# BUG `deleted-data-files` property is being replaced by the previous summary's `total-data-files` value
315-
# OVERWRITE from tbl.delete
318+
# Java produces:
319+
# {
320+
# "added-data-files": "1",
321+
# "added-files-size": "707",
322+
# "added-records": "2",
323+
# "app-id": "local-1743678304626",
324+
# "changed-partition-count": "1",
325+
# "deleted-data-files": "1",
326+
# "deleted-records": "3",
327+
# "engine-name": "spark",
328+
# "engine-version": "3.5.5",
329+
# "iceberg-version": "Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)",
330+
# "removed-files-size": "693",
331+
# "spark.app.id": "local-1743678304626",
332+
# "total-data-files": "3",
333+
# "total-delete-files": "0",
334+
# "total-equality-deletes": "0",
335+
# "total-files-size": "1993",
336+
# "total-position-deletes": "0",
337+
# "total-records": "4"
338+
# }
339+
files = tbl.inspect.data_files()
340+
assert len(files) == 3
316341
assert summaries[1] == {
317342
"added-data-files": "1",
318343
"added-files-size": "859",
319-
"added-records": "2", # wrong should be 0
344+
"added-records": "2",
320345
"changed-partition-count": "1",
321-
"deleted-data-files": "3", # wrong should be 1
322-
"deleted-records": "5", # wrong should be 1
323-
"removed-files-size": "2848",
324-
"total-data-files": "1", # wrong should be 3
346+
"deleted-data-files": "1",
347+
"deleted-records": "3",
348+
"removed-files-size": "950",
349+
"total-data-files": "3",
325350
"total-delete-files": "0",
326351
"total-equality-deletes": "0",
327-
"total-files-size": "859",
352+
"total-files-size": "2757",
328353
"total-position-deletes": "0",
329-
"total-records": "2", # wrong should be 4
354+
"total-records": "4",
330355
}
331-
assert len(tbl.inspect.data_files()) == 3
332356
assert len(tbl.scan().to_pandas()) == 4
333357

334358

tests/table/test_snapshots.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,6 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None:
289289
"total-position-deletes": "1",
290290
"total-records": "1",
291291
},
292-
truncate_full_table=True,
293292
)
294293

295294
expected = {
@@ -337,7 +336,6 @@ def test_invalid_type() -> None:
337336
},
338337
),
339338
previous_summary={"total-data-files": "abc"}, # should be a number
340-
truncate_full_table=True,
341339
)
342340

343341
assert "Could not parse summary property total-data-files to an int: abc" in str(e.value)

0 commit comments

Comments
 (0)