Skip to content

Commit 1126a6e

Browse files
author
Tom McCormick
committed
Add comprehensive ORC read support to PyArrow I/O
Features implemented:
- Record batching and table reading via ArrowScan
- Column projection and row filtering with predicate pushdown
- Positional deletes support (with ORC-specific non-dictionary handling)
- Schema mapping for files without field IDs
- Streaming via Iterator[pa.RecordBatch] for memory efficiency
- Full integration with Iceberg metadata and partitioning
1 parent 52d810e commit 1126a6e

5 files changed

Lines changed: 573 additions & 25 deletions

File tree

pyiceberg/io/pyarrow.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1011,6 +1011,10 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression) -> pc.Expressi
10111011
def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.FileFormat:
10121012
if file_format == FileFormat.PARQUET:
10131013
return ds.ParquetFileFormat(**kwargs)
1014+
elif file_format == FileFormat.ORC:
1015+
# ORC doesn't support pre_buffer and buffer_size parameters
1016+
orc_kwargs = {k: v for k, v in kwargs.items() if k not in ['pre_buffer', 'buffer_size']}
1017+
return ds.OrcFileFormat(**orc_kwargs)
10141018
else:
10151019
raise ValueError(f"Unsupported file format: {file_format}")
10161020

@@ -1027,6 +1031,15 @@ def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray]
10271031
file.as_py(): table.filter(pc.field("file_path") == file).column("pos")
10281032
for file in table.column("file_path").chunks[0].dictionary
10291033
}
1034+
elif data_file.file_format == FileFormat.ORC:
1035+
with io.new_input(data_file.file_path).open() as fi:
1036+
delete_fragment = _get_file_format(data_file.file_format).make_fragment(fi)
1037+
table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
1038+
# For ORC, file_path columns are not dictionary-encoded, so we use unique() directly
1039+
return {
1040+
path.as_py(): table.filter(pc.field("file_path") == path).column("pos")
1041+
for path in table.column("file_path").unique()
1042+
}
10301043
elif data_file.file_format == FileFormat.PUFFIN:
10311044
with io.new_input(data_file.file_path).open() as fi:
10321045
payload = fi.read()
@@ -1495,7 +1508,7 @@ def _task_to_record_batches(
14951508
format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
14961509
downcast_ns_timestamp_to_us: Optional[bool] = None,
14971510
) -> Iterator[pa.RecordBatch]:
1498-
arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
1511+
arrow_format = _get_file_format(task.file.file_format, pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
14991512
with io.new_input(task.file.file_path).open() as fin:
15001513
fragment = arrow_format.make_fragment(fin)
15011514
physical_schema = fragment.physical_schema

pyiceberg/table/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,9 @@ class TableProperties:
211211
WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT = True
212212

213213
WRITE_DATA_PATH = "write.data.path"
214+
215+
WRITE_FILE_FORMAT = "write.format.default"
216+
WRITE_FILE_FORMAT_DEFAULT = "parquet"
214217
WRITE_METADATA_PATH = "write.metadata.path"
215218

216219
DELETE_MODE = "write.delete.mode"

tests/conftest.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2413,6 +2413,32 @@ def example_task(data_file: str) -> FileScanTask:
24132413
)
24142414

24152415

2416+
@pytest.fixture
2417+
def data_file_orc(table_schema_simple: Schema, tmp_path: str) -> str:
2418+
import pyarrow as pa
2419+
import pyarrow.orc as orc
2420+
2421+
from pyiceberg.io.pyarrow import schema_to_pyarrow
2422+
2423+
table = pa.table(
2424+
{"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": [True, False, None]},
2425+
schema=schema_to_pyarrow(table_schema_simple),
2426+
)
2427+
2428+
file_path = f"{tmp_path}/0000-data.orc"
2429+
orc.write_table(table=table, where=file_path)
2430+
return file_path
2431+
2432+
2433+
@pytest.fixture
2434+
def example_task_orc(data_file_orc: str) -> FileScanTask:
2435+
datafile = DataFile.from_args(file_path=data_file_orc, file_format=FileFormat.ORC, file_size_in_bytes=1925)
2436+
datafile.spec_id = 0
2437+
return FileScanTask(
2438+
data_file=datafile,
2439+
)
2440+
2441+
24162442
@pytest.fixture(scope="session")
24172443
def warehouse(tmp_path_factory: pytest.TempPathFactory) -> Path:
24182444
return tmp_path_factory.mktemp("test_sql")
@@ -2442,6 +2468,23 @@ def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table:
24422468
)
24432469

24442470

2471+
@pytest.fixture
2472+
def table_v2_orc(example_table_metadata_v2: Dict[str, Any]) -> Table:
2473+
import copy
2474+
metadata_dict = copy.deepcopy(example_table_metadata_v2)
2475+
if not metadata_dict["properties"]:
2476+
metadata_dict["properties"] = {}
2477+
metadata_dict["properties"]["write.format.default"] = "ORC"
2478+
table_metadata = TableMetadataV2(**metadata_dict)
2479+
return Table(
2480+
identifier=("database", "table_orc"),
2481+
metadata=table_metadata,
2482+
metadata_location=f"{table_metadata.location}/uuid.metadata.json",
2483+
io=load_file_io(),
2484+
catalog=NoopCatalog("NoopCatalog"),
2485+
)
2486+
2487+
24452488
@pytest.fixture
24462489
def table_v2_with_fixed_and_decimal_types(
24472490
table_metadata_v2_with_fixed_and_decimal_types: Dict[str, Any],

0 commit comments

Comments (0)