@@ -1625,7 +1625,9 @@ def _table_from_scan_task(task: FileScanTask) -> pa.Table:

        return result

-    def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
+    def to_record_batches(
+        self, tasks: Iterable[FileScanTask], concurrent_tasks: Optional[int] = None
+    ) -> Iterator[pa.RecordBatch]:
        """Scan the Iceberg table and return an Iterator[pa.RecordBatch].

        Returns an Iterator of pa.RecordBatch with data from the Iceberg table
@@ -1634,6 +1636,7 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:

        Args:
            tasks: FileScanTasks representing the data files and delete files to read from.
+            concurrent_tasks: If set, the number of file scan tasks to read concurrently; when None (the default), tasks are read sequentially.

        Returns:
            An Iterator of PyArrow RecordBatches.
@@ -1643,8 +1646,21 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
            ResolveError: When a required field cannot be found in the file
            ValueError: When a field type in the file cannot be projected to the schema type
        """
+        from concurrent.futures import ThreadPoolExecutor
+
        deletes_per_file = _read_all_delete_files(self._io, tasks)
-        return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file)
+
+        if concurrent_tasks is not None:
+            # Read each task on its own worker thread; pool.map preserves task order.
+            with ThreadPoolExecutor(max_workers=concurrent_tasks) as pool:
+                for batches in pool.map(
+                    lambda task: list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file)), tasks
+                ):
+                    for batch in batches:
+                        yield batch
+
+        else:
+            yield from self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file)

    def _record_batches_from_scan_tasks_and_deletes(
        self, tasks: Iterable[FileScanTask], deletes_per_file: Dict[str, List[ChunkedArray]]
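
For review context, here is a minimal usage sketch of the new parameter. The names `scan`, `tasks`, and `process` are placeholders (an already-constructed `ArrowScan`, a list of planned `FileScanTask`s, and a downstream callback); none of them are defined in this diff.

```python
# Hypothetical usage sketch; `scan` and `tasks` are assumed to exist.

# Default behavior, unchanged: tasks are read one after another.
for batch in scan.to_record_batches(tasks):
    process(batch)  # placeholder for downstream batch handling

# New: read up to 8 file scan tasks concurrently. ThreadPoolExecutor.map
# yields results in task order, so batch ordering stays deterministic.
for batch in scan.to_record_batches(tasks, concurrent_tasks=8):
    process(batch)
```

One trade-off worth noting: the concurrent path wraps each task's read in `list(...)` inside the worker, so a task's record batches are fully materialized in memory before being yielded; this trades peak memory for read parallelism.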