@@ -271,15 +271,19 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal
271271 }
272272 pa_schema = pa .schema (
273273 [
274- pa .field ("id" , pa .dictionary ( pa . int32 (), pa .int32 (), False )),
275- pa .field ("name" , pa .dictionary ( pa . int32 (), pa .string (), False )),
274+ pa .field ("id" , pa .int32 (), pa .int32 ()),
275+ pa .field ("name" , pa .int32 (), pa .string ()),
276276 ]
277277 )
278278 arrow_table = pa .Table .from_pydict (TEST_DATA , schema = pa_schema )
279279 tbl = _create_table (session_catalog , identifier , {"format-version" : "2" }, schema = pa_schema )
280280 with tbl .update_spec () as txn :
281- txn .add_identity ("id" ) # partition by `id` to create 3 data files
282- tbl .append (arrow_table ) # append
281+ txn .add_identity ("id" )
282+ tbl .append (arrow_table )
283+
284+ # TODO: We might want to check why this ends up in 3 files
285+ assert len (tbl .inspect .data_files ()) == 3
286+
283287 tbl .delete (delete_filter = "id == 1 and name = 'AB'" ) # partial overwrite data from 1 data file
284288
285289 rows = spark .sql (
@@ -311,24 +315,44 @@ def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catal
311315 "total-position-deletes" : "0" ,
312316 "total-records" : "5" ,
313317 }
314- # BUG `deleted-data-files` property is being replaced by the previous summary's `total-data-files` value
315- # OVERWRITE from tbl.delete
318+ # Java produces:
319+ # {
320+ # "added-data-files": "1",
321+ # "added-files-size": "707",
322+ # "added-records": "2",
323+ # "app-id": "local-1743678304626",
324+ # "changed-partition-count": "1",
325+ # "deleted-data-files": "1",
326+ # "deleted-records": "3",
327+ # "engine-name": "spark",
328+ # "engine-version": "3.5.5",
329+ # "iceberg-version": "Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)",
330+ # "removed-files-size": "693",
331+ # "spark.app.id": "local-1743678304626",
332+ # "total-data-files": "3",
333+ # "total-delete-files": "0",
334+ # "total-equality-deletes": "0",
335+ # "total-files-size": "1993",
336+ # "total-position-deletes": "0",
337+ # "total-records": "4"
338+ # }
339+ files = tbl .inspect .data_files ()
340+ assert len (files ) == 3
316341 assert summaries [1 ] == {
317342 "added-data-files" : "1" ,
318343 "added-files-size" : "859" ,
319- "added-records" : "2" , # wrong should be 0
344+ "added-records" : "2" ,
320345 "changed-partition-count" : "1" ,
321- "deleted-data-files" : "3" , # wrong should be 1
322- "deleted-records" : "5" , # wrong should be 1
323- "removed-files-size" : "2848 " ,
324- "total-data-files" : "1" , # wrong should be 3
346+ "deleted-data-files" : "1" ,
347+ "deleted-records" : "3" ,
348+ "removed-files-size" : "950 " ,
349+ "total-data-files" : "3" ,
325350 "total-delete-files" : "0" ,
326351 "total-equality-deletes" : "0" ,
327- "total-files-size" : "859 " ,
352+ "total-files-size" : "2757 " ,
328353 "total-position-deletes" : "0" ,
329- "total-records" : "2" , # wrong should be 4
354+ "total-records" : "4" ,
330355 }
331- assert len (tbl .inspect .data_files ()) == 3
332356 assert len (tbl .scan ().to_pandas ()) == 4
333357
334358
0 commit comments