|
20 | 20 | from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple |
21 | 21 |
|
22 | 22 | from pyiceberg.conversions import from_bytes |
23 | | -from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, PartitionFieldSummary |
| 23 | +from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary |
24 | 24 | from pyiceberg.partitioning import PartitionSpec |
25 | 25 | from pyiceberg.table.snapshots import Snapshot, ancestors_of |
26 | 26 | from pyiceberg.types import PrimitiveType |
@@ -384,14 +384,35 @@ def _get_all_manifests_schema(self) -> "pa.Schema": |
384 | 384 | all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False)) |
385 | 385 | return all_manifests_schema |
386 | 386 |
|
def _get_positional_file_schema(self) -> "pa.Schema":
    """Build the Arrow schema used when reading a positional delete file.

    The schema has three fields: ``file_path`` (string, required), ``pos``
    (int64, required) and ``row`` (nullable), where the type of ``row`` is
    the result of converting ``self.tbl.schema().as_struct()`` with
    ``schema_to_pyarrow``.

    Returns:
        The ``pa.Schema`` made of the three fields above.
    """
    import pyarrow as pa

    from pyiceberg.io.pyarrow import schema_to_pyarrow

    # Arrow type for the optional copy of the deleted row.
    row_type = schema_to_pyarrow(self.tbl.schema().as_struct())
    fields = [
        pa.field("file_path", pa.string(), nullable=False),
        pa.field("pos", pa.int64(), nullable=False),
        pa.field("row", row_type, nullable=True),
    ]
    return pa.schema(fields)
| 401 | + |
387 | 402 | def _get_positional_deletes_schema(self) -> "pa.Schema": |
388 | 403 | import pyarrow as pa |
389 | 404 |
|
| 405 | + from pyiceberg.io.pyarrow import schema_to_pyarrow |
| 406 | + |
| 407 | + partition_record = self.tbl.metadata.specs_struct() |
| 408 | + pa_partition_struct = schema_to_pyarrow(partition_record) |
| 409 | + pa_row_struct = schema_to_pyarrow(self.tbl.schema().as_struct()) |
390 | 410 | positinal_delete_schema = pa.schema( |
391 | 411 | [ |
392 | 412 | pa.field("file_path", pa.string(), nullable=False), |
393 | 413 | pa.field("pos", pa.int64(), nullable=False), |
394 | | - pa.field("row", pa.int64(), nullable=True), |
| 414 | + pa.field("row", pa_row_struct, nullable=True), |
| 415 | + pa.field("partition", pa_partition_struct, nullable=False), |
395 | 416 | pa.field("spec_id", pa.int64(), nullable=True), |
396 | 417 | pa.field("delete_file_path", pa.string(), nullable=False), |
397 | 418 | ] |
@@ -467,23 +488,30 @@ def _partition_summaries_to_rows( |
467 | 488 | schema=self._get_all_manifests_schema() if is_all_manifests_table else self._get_manifests_schema(), |
468 | 489 | ) |
469 | 490 |
|
470 | | - # def _generate_positional_delete_table(self, manifest_list: ManifestFile) -> "pa.Table": |
471 | | - # import pyarrow as pa |
472 | | - # all_deletes = [] |
473 | | - # if manifest_list.content == ManifestContent.DELETES: |
474 | | - # for manifest_entry in manifest_list.fetch_manifest_entry(self.tbl.io): |
475 | | - # if manifest_entry.data_file.content == DataFileContent.POSITION_DELETES: |
476 | | - # from pyiceberg.io.pyarrow import _read_delete_file |
477 | | - # from pyiceberg.io.pyarrow import _fs_from_file_path |
478 | | - # positional_delete = _read_delete_file( |
479 | | - # _fs_from_file_path(self.tbl.io, manifest_entry.data_file.file_path), |
480 | | - # manifest_entry.data_file) |
481 | | - # |
482 | | - # positional_delete = positional_delete.append_column("spec_id", pa.array( |
483 | | - # [manifest_list.partition_spec_id] * len(positional_delete))).append_column("delete_file_path",pa.array([manifest_entry.data_file.file_path] * len(positional_delete))) |
484 | | - # |
485 | | - # all_deletes.append(positional_delete) |
486 | | - # return pa.concat_tables(all_deletes) |
def _generate_positional_delete_table(self, manifest: ManifestFile, position_deletes_schema: "pa.Schema") -> "pa.Table":
    """Collect the positional-delete rows referenced by a single manifest.

    Only manifests whose content is ``ManifestContent.DELETES`` are scanned,
    and within them only entries whose data file content is
    ``DataFileContent.POSITION_DELETES``. Every row read from such a delete
    file is extended with ``partition``, ``spec_id`` and ``delete_file_path``
    before being converted to an Arrow table.

    Args:
        manifest: The manifest to inspect.
        position_deletes_schema: Schema applied to every table produced here,
            including the empty one.

    Returns:
        The concatenation of one table per positional delete file, or an
        empty table with ``position_deletes_schema`` when the manifest yields
        no positional delete files.
    """
    import pyarrow as pa

    if manifest.content != ManifestContent.DELETES:
        return pa.Table.from_pylist([], position_deletes_schema)

    # Imported once per call instead of once per manifest entry.
    from pyiceberg.io.pyarrow import _fs_from_file_path, _read_delete_file

    # Loop-invariant: built on first use and then reused for every delete file.
    delete_file_schema = None
    tables: List["pa.Table"] = []

    for entry in manifest.fetch_manifest_entry(self.tbl.io):
        data_file = entry.data_file
        if data_file.content != DataFileContent.POSITION_DELETES:
            continue

        if delete_file_schema is None:
            delete_file_schema = self._get_positional_file_schema()

        records = _read_delete_file(
            _fs_from_file_path(self.tbl.io, data_file.file_path),
            data_file,
            delete_file_schema,
        ).to_pylist()

        # NOTE(review): relies on the partition record exposing its values
        # through ``__dict__`` -- confirm it holds only the partition fields.
        partition = data_file.partition.__dict__
        for record in records:
            record["partition"] = partition
            record["spec_id"] = manifest.partition_spec_id
            record["delete_file_path"] = data_file.file_path

        tables.append(pa.Table.from_pylist(records, position_deletes_schema))

    if not tables:
        return pa.Table.from_pylist([], position_deletes_schema)
    return pa.concat_tables(tables)
487 | 515 |
|
488 | 516 | def manifests(self) -> "pa.Table": |
489 | 517 | return self._generate_manifests_table(self.tbl.current_snapshot()) |
@@ -693,38 +721,15 @@ def all_manifests(self) -> "pa.Table": |
def position_deletes(self) -> "pa.Table":
    """Return the positional deletes reachable from the current snapshot.

    Each manifest of the current snapshot is handed to
    ``_generate_positional_delete_table`` through the executor obtained from
    ``ExecutorFactory.get_or_create()``, and the per-manifest tables are
    concatenated.

    Returns:
        A table with the schema from ``_get_positional_deletes_schema()``.
        It is empty when the table has no current snapshot or when the
        current snapshot lists no manifests.
    """
    import pyarrow as pa

    position_deletes_schema = self._get_positional_deletes_schema()
    current_snapshot = self.tbl.current_snapshot()

    if not current_snapshot:
        return pa.Table.from_pylist([], schema=position_deletes_schema)

    executor = ExecutorFactory.get_or_create()
    per_manifest_tables: List["pa.Table"] = list(
        executor.map(
            lambda manifest: self._generate_positional_delete_table(manifest, position_deletes_schema),
            current_snapshot.manifests(self.tbl.io),
        )
    )

    # pa.concat_tables raises when given no tables, so a snapshot without
    # manifests must be answered with an explicit empty table.
    if not per_manifest_tables:
        return pa.Table.from_pylist([], schema=position_deletes_schema)
    return pa.concat_tables(per_manifest_tables)
0 commit comments