Skip to content

Commit e06c01a

Browse files
sumedhsakdeo and claude
committed
test: Refactor benchmark tests for new ScanOrder API
Restructure parameterized benchmark tests to use ScanOrder class instances:
- TaskOrder() for default behavior
- ArrivalOrder(concurrent_streams=N) for streaming configurations

Simplifies test parameters by eliminating the separate concurrent_files argument.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 19841dc commit e06c01a

1 file changed

Lines changed: 10 additions & 12 deletions

File tree

tests/benchmark/test_read_benchmark.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
import pytest
3636

3737
from pyiceberg.catalog.sql import SqlCatalog
38-
from pyiceberg.table import ScanOrder, Table
38+
from pyiceberg.table import ScanOrder, TaskOrder, ArrivalOrder, Table
3939

4040
NUM_FILES = 32
4141
ROWS_PER_FILE = 500_000
@@ -85,26 +85,25 @@ def benchmark_table(tmp_path_factory: pytest.TempPathFactory) -> Table:
8585

8686

8787
@pytest.mark.parametrize(
88-
"order,concurrent_files,batch_size",
88+
"order,batch_size",
8989
[
90-
pytest.param(ScanOrder.TASK, 1, None, id="default"),
91-
pytest.param(ScanOrder.ARRIVAL, 1, None, id="arrival-cf1"),
92-
pytest.param(ScanOrder.ARRIVAL, 2, None, id="arrival-cf2"),
93-
pytest.param(ScanOrder.ARRIVAL, 4, None, id="arrival-cf4"),
94-
pytest.param(ScanOrder.ARRIVAL, 8, None, id="arrival-cf8"),
95-
pytest.param(ScanOrder.ARRIVAL, 16, None, id="arrival-cf16"),
90+
pytest.param(TaskOrder(), None, id="default"),
91+
pytest.param(ArrivalOrder(concurrent_streams=1), None, id="arrival-cf1"),
92+
pytest.param(ArrivalOrder(concurrent_streams=2), None, id="arrival-cf2"),
93+
pytest.param(ArrivalOrder(concurrent_streams=4), None, id="arrival-cf4"),
94+
pytest.param(ArrivalOrder(concurrent_streams=8), None, id="arrival-cf8"),
95+
pytest.param(ArrivalOrder(concurrent_streams=16), None, id="arrival-cf16"),
9696
],
9797
)
9898
def test_read_throughput(
9999
benchmark_table: Table,
100100
order: ScanOrder,
101-
concurrent_files: int,
102101
batch_size: int | None,
103102
) -> None:
104103
"""Measure records/sec, time to first record, and peak Arrow memory for a scan configuration."""
105104
effective_batch_size = batch_size or 131_072 # PyArrow default
106-
if order == ScanOrder.ARRIVAL:
107-
config_str = f"order=ARRIVAL, concurrent_files={concurrent_files}, batch_size={effective_batch_size}"
105+
if isinstance(order, ArrivalOrder):
106+
config_str = f"order=ARRIVAL, concurrent_streams={order.concurrent_streams}, batch_size={effective_batch_size}"
108107
else:
109108
config_str = f"order=TASK (executor.map, all files parallel), batch_size={effective_batch_size}"
110109
print("\n--- ArrowScan Read Throughput Benchmark ---")
@@ -129,7 +128,6 @@ def test_read_throughput(
129128
for batch in benchmark_table.scan().to_arrow_batch_reader(
130129
batch_size=batch_size,
131130
order=order,
132-
concurrent_files=concurrent_files,
133131
):
134132
if first_batch_time is None:
135133
first_batch_time = timeit.default_timer() - start

0 commit comments

Comments (0)