
Commit 88a4ad2

Accept concurrent_tasks when fetching record_batches
1 parent: 4614543

2 files changed: 19 additions & 4 deletions


pyiceberg/io/pyarrow.py

Lines changed: 17 additions & 2 deletions
@@ -1625,7 +1625,9 @@ def _table_from_scan_task(task: FileScanTask) -> pa.Table:
 
             return result
 
-    def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
+    def to_record_batches(
+        self, tasks: Iterable[FileScanTask], concurrent_tasks: Optional[int] = None
+    ) -> Iterator[pa.RecordBatch]:
         """Scan the Iceberg table and return an Iterator[pa.RecordBatch].
 
         Returns an Iterator of pa.RecordBatch with data from the Iceberg table
@@ -1634,6 +1636,7 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
 
         Args:
             tasks: FileScanTasks representing the data files and delete files to read from.
+            concurrent_tasks: Number of tasks to read concurrently, or None to read sequentially.
 
         Returns:
             An Iterator of PyArrow RecordBatches.
@@ -1643,8 +1646,20 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
             ResolveError: When a required field cannot be found in the file
             ValueError: When a field type in the file cannot be projected to the schema type
         """
+        from concurrent.futures import ThreadPoolExecutor
+
         deletes_per_file = _read_all_delete_files(self._io, tasks)
-        return self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file)
+
+        if concurrent_tasks is not None:
+            with ThreadPoolExecutor(max_workers=concurrent_tasks) as pool:
+                for batches in pool.map(
+                    lambda task: list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file)), tasks
+                ):
+                    for batch in batches:
+                        yield batch
+
+        else:
+            yield from self._record_batches_from_scan_tasks_and_deletes(tasks, deletes_per_file)
 
     def _record_batches_from_scan_tasks_and_deletes(
         self, tasks: Iterable[FileScanTask], deletes_per_file: Dict[str, List[ChunkedArray]]
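
The new concurrent path fans each FileScanTask out to a thread pool and stitches the per-task batch lists back into one ordered iterator. Below is a minimal standalone sketch of that pattern; read_task is a placeholder standing in for _record_batches_from_scan_tasks_and_deletes, and none of the names here are PyIceberg APIs:

from concurrent.futures import ThreadPoolExecutor

def read_task(task):
    # Stand-in for _record_batches_from_scan_tasks_and_deletes([task], ...).
    # Each task's batches are materialized into a list so the pool can hand
    # them back as a single unit per task.
    return [f"batch-{task}-{n}" for n in range(2)]

def to_record_batches(tasks, concurrent_tasks=None):
    if concurrent_tasks is not None:
        with ThreadPoolExecutor(max_workers=concurrent_tasks) as pool:
            # pool.map preserves input order, so batches are still yielded
            # in plan-file order even though tasks run concurrently.
            for task_batches in pool.map(read_task, tasks):
                yield from task_batches
    else:
        for task in tasks:
            yield from read_task(task)

print(list(to_record_batches(range(3), concurrent_tasks=2)))

The trade-off in this design: wrapping the per-task read in list(...) buffers an entire task's batches in memory before any are yielded, which is the cost of letting pool.map return one result per task.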

pyiceberg/table/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1864,7 +1864,7 @@ def to_arrow(self) -> pa.Table:
             self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
         ).to_table(self.plan_files())
 
-    def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
+    def to_arrow_batch_reader(self, concurrent_tasks: Optional[int] = None) -> pa.RecordBatchReader:
         """Return an Arrow RecordBatchReader from this DataScan.
 
         For large results, using a RecordBatchReader requires less memory than
@@ -1882,7 +1882,7 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
         target_schema = schema_to_pyarrow(self.projection())
         batches = ArrowScan(
             self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
-        ).to_record_batches(self.plan_files())
+        ).to_record_batches(self.plan_files(), concurrent_tasks=concurrent_tasks)
 
         return pa.RecordBatchReader.from_batches(
             target_schema,
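
A hedged sketch of how a caller might use the updated entry point, assuming a table already registered in a catalog; the catalog name "default", the identifier "db.events", and process_batch are illustrative placeholders, not part of this commit:

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")           # illustrative catalog name
table = catalog.load_table("db.events")     # illustrative table identifier
reader = table.scan().to_arrow_batch_reader(concurrent_tasks=8)  # new keyword from this commit
for batch in reader:                        # pa.RecordBatchReader is iterable
    process_batch(batch)                    # placeholder for user code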
