refactor: update the test_replace.py to use a reusable _create_dummy_data_file()

qzyu999 · qzyu999 · commit 9681ec3369e7 · 2026-04-16T13:48:35.000-07:00
diff --git a/tests/table/test_replace.py b/tests/table/test_replace.py
@@ -31,6 +31,28 @@
 from pyiceberg.typedef import Record
 
 
+def _create_dummy_data_file(
+    file_path: str,
+    record_count: int,
+    file_size_in_bytes: int = 1024,
+    content: DataFileContent = DataFileContent.DATA,
+    partition: Record | None = None,
+    spec_id: int = 0,
+) -> DataFile:
+    if partition is None:
+        partition = Record()
+    df = DataFile.from_args(
+        file_path=file_path,
+        file_format=FileFormat.PARQUET,
+        partition=partition,
+        record_count=record_count,
+        file_size_in_bytes=file_size_in_bytes,
+        content=content,
+    )
+    df.spec_id = spec_id
+    return df
+
+
 def test_replace_internally(catalog: Catalog) -> None:
     # Setup a basic table using the catalog fixture
     catalog.create_namespace("default")
@@ -40,37 +62,23 @@ def test_replace_internally(catalog: Catalog) -> None:
     )
 
     # 1. File we will delete
-    file_to_delete = DataFile.from_args(
+    file_to_delete = _create_dummy_data_file(
         file_path="s3://bucket/test/data/deleted.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_to_delete.spec_id = 0
 
     # 2. File we will leave completely untouched
-    file_to_keep = DataFile.from_args(
+    file_to_keep = _create_dummy_data_file(
         file_path="s3://bucket/test/data/kept.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=50,
         file_size_in_bytes=512,
-        content=DataFileContent.DATA,
     )
-    file_to_keep.spec_id = 0
 
     # 3. File we are adding as a replacement
-    file_to_add = DataFile.from_args(
+    file_to_add = _create_dummy_data_file(
         file_path="s3://bucket/test/data/added.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_to_add.spec_id = 0
 
     # Initially append BOTH the file to delete and the file to keep
     with table.transaction() as tx:
@@ -152,35 +160,23 @@ def test_replace_reuses_unaffected_manifests(catalog: Catalog) -> None:
         schema=Schema(),
     )
 
-    file_a = DataFile.from_args(
+    file_a = _create_dummy_data_file(
         file_path="s3://bucket/test/data/a.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=10,
         file_size_in_bytes=100,
-        content=DataFileContent.DATA,
     )
-    file_a.spec_id = 0
 
-    file_b = DataFile.from_args(
+    file_b = _create_dummy_data_file(
         file_path="s3://bucket/test/data/b.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=10,
         file_size_in_bytes=100,
-        content=DataFileContent.DATA,
     )
-    file_b.spec_id = 0
 
-    file_c = DataFile.from_args(
+    file_c = _create_dummy_data_file(
         file_path="s3://bucket/test/data/c.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=10,
         file_size_in_bytes=100,
-        content=DataFileContent.DATA,
     )
-    file_c.spec_id = 0
 
     # Commit 1: Append file A (Creates Manifest 1)
     with table.transaction() as tx:
@@ -260,25 +256,15 @@ def test_replace_missing_file_abort(catalog: Catalog) -> None:
         schema=Schema(),
     )
 
-    fake_data_file = DataFile.from_args(
+    fake_data_file = _create_dummy_data_file(
         file_path="s3://bucket/test/data/does_not_exist.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    fake_data_file.spec_id = 0
 
-    new_data_file = DataFile.from_args(
+    new_data_file = _create_dummy_data_file(
         file_path="s3://bucket/test/data/new.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    new_data_file.spec_id = 0
 
     # Ensure it aborts when trying to replace a file that isn't in the table
     with pytest.raises(ValueError, match="Cannot delete files that are not present in the table"):
@@ -296,26 +282,16 @@ def test_replace_invariant_violation(catalog: Catalog) -> None:
         schema=Schema(),
     )
 
-    file_to_delete = DataFile.from_args(
+    file_to_delete = _create_dummy_data_file(
         file_path="s3://bucket/test/data/deleted.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_to_delete.spec_id = 0
 
     # Create a new file with MORE records than the one we are deleting
-    too_many_records_file = DataFile.from_args(
+    too_many_records_file = _create_dummy_data_file(
         file_path="s3://bucket/test/data/too_many.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=101,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    too_many_records_file.spec_id = 0
 
     # Initially append to have something to replace
     with table.transaction() as tx:
@@ -339,26 +315,17 @@ def test_replace_allows_shrinking_for_soft_deletes(catalog: Catalog) -> None:
     )
 
     # Old data file has 100 records
-    file_to_delete = DataFile.from_args(
+    file_to_delete = _create_dummy_data_file(
         file_path="s3://bucket/test/data/deleted.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_to_delete.spec_id = 0
 
     # New data file only has 90 records (simulating 10 records were soft-deleted)
-    shrunk_file_to_add = DataFile.from_args(
+    shrunk_file_to_add = _create_dummy_data_file(
         file_path="s3://bucket/test/data/shrunk.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=90,
         file_size_in_bytes=900,
-        content=DataFileContent.DATA,
     )
-    shrunk_file_to_add.spec_id = 0
 
     # Initially append
     with table.transaction() as tx:
@@ -389,37 +356,26 @@ def test_replace_passes_through_delete_manifests(catalog: Catalog) -> None:
     )
 
     # 1. Data file we will replace
-    file_a = DataFile.from_args(
+    file_a = _create_dummy_data_file(
         file_path="s3://bucket/test/data/a.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=10,
         file_size_in_bytes=100,
-        content=DataFileContent.DATA,
     )
-    file_a.spec_id = 0
 
     # 2. A Position Delete file (representing row-level deletes)
-    file_a_deletes = DataFile.from_args(
+    file_a_deletes = _create_dummy_data_file(
         file_path="s3://bucket/test/data/a_deletes.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=2,
         file_size_in_bytes=50,
         content=DataFileContent.POSITION_DELETES,
     )
-    file_a_deletes.spec_id = 0
 
     # 3. Data file we are adding as a replacement
-    file_b = DataFile.from_args(
+    file_b = _create_dummy_data_file(
         file_path="s3://bucket/test/data/b.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=10,
         file_size_in_bytes=100,
-        content=DataFileContent.DATA,
     )
-    file_b.spec_id = 0
 
     # Commit 1: Append the data file
     with table.transaction() as tx:
@@ -467,45 +423,27 @@ def test_replace_multiple_files(catalog: Catalog) -> None:
         schema=Schema(),
     )
 
-    file_1 = DataFile.from_args(
+    file_1 = _create_dummy_data_file(
         file_path="s3://bucket/test/data/1.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_1.spec_id = 0
 
-    file_2 = DataFile.from_args(
+    file_2 = _create_dummy_data_file(
         file_path="s3://bucket/test/data/2.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_2.spec_id = 0
 
-    file_1_new = DataFile.from_args(
+    file_1_new = _create_dummy_data_file(
         file_path="s3://bucket/test/data/1_new.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=50,
         file_size_in_bytes=512,
-        content=DataFileContent.DATA,
     )
-    file_1_new.spec_id = 0
 
-    file_2_new = DataFile.from_args(
+    file_2_new = _create_dummy_data_file(
         file_path="s3://bucket/test/data/2_new.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=50,
         file_size_in_bytes=512,
-        content=DataFileContent.DATA,
     )
-    file_2_new.spec_id = 0
 
     # Append initial files
     with table.transaction() as tx:
@@ -550,26 +488,20 @@ def test_replace_partitioned_table(catalog: Catalog) -> None:
     )
 
     # File in partition id=1
-    file_part1 = DataFile.from_args(
+    file_part1 = _create_dummy_data_file(
         file_path="s3://bucket/test/data/part1.parquet",
-        file_format=FileFormat.PARQUET,
         partition=Record(1),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
+        spec_id=table.spec().spec_id,
     )
-    file_part1.spec_id = table.spec().spec_id
 
     # File in partition id=2
-    file_part2 = DataFile.from_args(
+    file_part2 = _create_dummy_data_file(
         file_path="s3://bucket/test/data/part2.parquet",
-        file_format=FileFormat.PARQUET,
         partition=Record(2),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
+        spec_id=table.spec().spec_id,
     )
-    file_part2.spec_id = table.spec().spec_id
 
     # Add initial files
     with table.transaction() as tx:
@@ -578,15 +510,13 @@ def test_replace_partitioned_table(catalog: Catalog) -> None:
             append_snapshot.append_data_file(file_part2)
 
     # Replace file in partition 1
-    file_part1_new = DataFile.from_args(
+    file_part1_new = _create_dummy_data_file(
         file_path="s3://bucket/test/data/part1_new.parquet",
-        file_format=FileFormat.PARQUET,
         partition=Record(1),
         record_count=50,
         file_size_in_bytes=512,
-        content=DataFileContent.DATA,
+        spec_id=table.spec().spec_id,
     )
-    file_part1_new.spec_id = table.spec().spec_id
 
     with table.transaction() as tx:
         with tx.update_snapshot().replace() as rewrite:
@@ -609,15 +539,11 @@ def test_replace_no_op_on_non_empty_table(catalog: Catalog) -> None:
         schema=Schema(),
     )
 
-    file_a = DataFile.from_args(
+    file_a = _create_dummy_data_file(
         file_path="s3://bucket/test/data/a.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=10,
         file_size_in_bytes=100,
-        content=DataFileContent.DATA,
     )
-    file_a.spec_id = 0
 
     # Commit 1: Append file A
     with table.transaction() as tx:
@@ -646,26 +572,16 @@ def test_replace_on_custom_branch(catalog: Catalog) -> None:
     )
 
     # 1. File we will delete
-    file_to_delete = DataFile.from_args(
+    file_to_delete = _create_dummy_data_file(
         file_path="s3://bucket/test/data/deleted.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_to_delete.spec_id = 0
 
     # 2. File we are adding as a replacement
-    file_to_add = DataFile.from_args(
+    file_to_add = _create_dummy_data_file(
         file_path="s3://bucket/test/data/added.parquet",
-        file_format=FileFormat.PARQUET,
-        partition=Record(),
         record_count=100,
-        file_size_in_bytes=1024,
-        content=DataFileContent.DATA,
     )
-    file_to_add.spec_id = 0
 
     # Initially append to have something to replace on main
     with table.transaction() as tx: