Fixed bug for empty tables

vinjai · vinjai · commit 49f75b4ec021 · 2025-05-26T09:56:13.000+05:30
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
@@ -87,7 +87,7 @@
 from pyiceberg.table.name_mapping import (
     NameMapping,
 )
-from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef
+from pyiceberg.table.refs import SnapshotRef
 from pyiceberg.table.snapshots import (
     Snapshot,
     SnapshotLogEntry,
@@ -398,7 +398,7 @@ def _build_partition_predicate(self, partition_records: Set[Record]) -> BooleanE
             expr = Or(expr, match_partition_expression)
         return expr
 
-    def _append_snapshot_producer(self, snapshot_properties: Dict[str, str], branch: str) -> _FastAppendFiles:
+    def _append_snapshot_producer(self, snapshot_properties: Dict[str, str], branch: Optional[str]) -> _FastAppendFiles:
         """Determine the append type based on table properties.
 
         Args:
@@ -431,7 +431,7 @@ def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive
             name_mapping=self.table_metadata.name_mapping(),
         )
 
-    def update_snapshot(self, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str = MAIN_BRANCH) -> UpdateSnapshot:
+    def update_snapshot(self, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: Optional[str] = None) -> UpdateSnapshot:
         """Create a new UpdateSnapshot to produce a new snapshot for the table.
 
         Returns:
@@ -448,7 +448,7 @@ def update_statistics(self) -> UpdateStatistics:
         """
         return UpdateStatistics(transaction=self)
 
-    def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str = MAIN_BRANCH) -> None:
+    def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: Optional[str] = None) -> None:
         """
         Shorthand API for appending a PyArrow table to a table transaction.
 
@@ -490,7 +490,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT,
                     append_files.append_data_file(data_file)
 
     def dynamic_partition_overwrite(
-        self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str = MAIN_BRANCH
+        self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: Optional[str] = None
     ) -> None:
         """
         Shorthand for overwriting existing partitions with a PyArrow table.
@@ -554,7 +554,7 @@ def overwrite(
         overwrite_filter: Union[BooleanExpression, str] = ALWAYS_TRUE,
         snapshot_properties: Dict[str, str] = EMPTY_DICT,
         case_sensitive: bool = True,
-        branch: str = MAIN_BRANCH,
+        branch: Optional[str] = None,
     ) -> None:
         """
         Shorthand for adding a table overwrite with a PyArrow table to the transaction.
@@ -617,7 +617,7 @@ def delete(
         delete_filter: Union[str, BooleanExpression],
         snapshot_properties: Dict[str, str] = EMPTY_DICT,
         case_sensitive: bool = True,
-        branch: str = MAIN_BRANCH,
+        branch: Optional[str] = None,
     ) -> None:
         """
         Shorthand for deleting record from a table.
@@ -656,7 +656,10 @@ def delete(
             bound_delete_filter = bind(self.table_metadata.schema(), delete_filter, case_sensitive)
             preserve_row_filter = _expression_to_complementary_pyarrow(bound_delete_filter)
 
-            files = self._scan(row_filter=delete_filter, case_sensitive=case_sensitive).use_ref(branch).plan_files()
+            if branch is None:
+                files = self._scan(row_filter=delete_filter, case_sensitive=case_sensitive).plan_files()
+            else:
+                files = self._scan(row_filter=delete_filter, case_sensitive=case_sensitive).use_ref(branch).plan_files()
 
             commit_uuid = uuid.uuid4()
             counter = itertools.count(0)
@@ -717,6 +720,7 @@ def upsert(
         when_matched_update_all: bool = True,
         when_not_matched_insert_all: bool = True,
         case_sensitive: bool = True,
+        branch: Optional[str] = None,
     ) -> UpsertResult:
         """Shorthand API for performing an upsert to an iceberg table.
 
@@ -727,6 +731,7 @@ def upsert(
             when_matched_update_all: Bool indicating to update rows that are matched but require an update due to a value in a non-key column changing
             when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table
             case_sensitive: Bool indicating if the match should be case-sensitive
+            branch: Branch Reference to run the upsert operation
 
             To learn more about the identifier-field-ids: https://iceberg.apache.org/spec/#identifier-field-ids
 
@@ -789,12 +794,24 @@ def upsert(
         matched_predicate = upsert_util.create_match_filter(df, join_cols)
 
         # We must use Transaction.table_metadata for the scan. This includes all uncommitted - but relevant - changes.
-        matched_iceberg_table = DataScan(
-            table_metadata=self.table_metadata,
-            io=self._table.io,
-            row_filter=matched_predicate,
-            case_sensitive=case_sensitive,
-        ).to_arrow()
+        if branch is None:
+            matched_iceberg_table = DataScan(
+                table_metadata=self.table_metadata,
+                io=self._table.io,
+                row_filter=matched_predicate,
+                case_sensitive=case_sensitive,
+            ).to_arrow()
+        else:
+            matched_iceberg_table = (
+                DataScan(
+                    table_metadata=self.table_metadata,
+                    io=self._table.io,
+                    row_filter=matched_predicate,
+                    case_sensitive=case_sensitive,
+                )
+                .use_ref(branch)
+                .to_arrow()
+            )
 
         update_row_cnt = 0
         insert_row_cnt = 0
@@ -811,7 +828,7 @@ def upsert(
                 # build the match predicate filter
                 overwrite_mask_predicate = upsert_util.create_match_filter(rows_to_update, join_cols)
 
-                self.overwrite(rows_to_update, overwrite_filter=overwrite_mask_predicate)
+                self.overwrite(rows_to_update, overwrite_filter=overwrite_mask_predicate, branch=branch)
 
         if when_not_matched_insert_all:
             expr_match = upsert_util.create_match_filter(matched_iceberg_table, join_cols)
@@ -822,7 +839,7 @@ def upsert(
             insert_row_cnt = len(rows_to_insert)
 
             if insert_row_cnt > 0:
-                self.append(rows_to_insert)
+                self.append(rows_to_insert, branch=branch)
 
         return UpsertResult(rows_updated=update_row_cnt, rows_inserted=insert_row_cnt)
 
@@ -1255,6 +1272,7 @@ def upsert(
         when_matched_update_all: bool = True,
         when_not_matched_insert_all: bool = True,
         case_sensitive: bool = True,
+        branch: Optional[str] = None,
     ) -> UpsertResult:
         """Shorthand API for performing an upsert to an iceberg table.
 
@@ -1265,6 +1283,7 @@ def upsert(
             when_matched_update_all: Bool indicating to update rows that are matched but require an update due to a value in a non-key column changing
             when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table
             case_sensitive: Bool indicating if the match should be case-sensitive
+            branch: Branch Reference to run the upsert operation
 
             To learn more about the identifier-field-ids: https://iceberg.apache.org/spec/#identifier-field-ids
 
@@ -1297,9 +1316,10 @@ def upsert(
                 when_matched_update_all=when_matched_update_all,
                 when_not_matched_insert_all=when_not_matched_insert_all,
                 case_sensitive=case_sensitive,
+                branch=branch,
             )
 
-    def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str = MAIN_BRANCH) -> None:
+    def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: Optional[str] = None) -> None:
         """
         Shorthand API for appending a PyArrow table to the table.
 
@@ -1312,7 +1332,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT,
             tx.append(df=df, snapshot_properties=snapshot_properties, branch=branch)
 
     def dynamic_partition_overwrite(
-        self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str = MAIN_BRANCH
+        self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: Optional[str] = None
     ) -> None:
         """Shorthand for dynamic overwriting the table with a PyArrow table.
 
@@ -1331,7 +1351,7 @@ def overwrite(
         overwrite_filter: Union[BooleanExpression, str] = ALWAYS_TRUE,
         snapshot_properties: Dict[str, str] = EMPTY_DICT,
         case_sensitive: bool = True,
-        branch: str = MAIN_BRANCH,
+        branch: Optional[str] = None,
     ) -> None:
         """
         Shorthand for overwriting the table with a PyArrow table.
@@ -1364,7 +1384,7 @@ def delete(
         delete_filter: Union[BooleanExpression, str] = ALWAYS_TRUE,
         snapshot_properties: Dict[str, str] = EMPTY_DICT,
         case_sensitive: bool = True,
-        branch: str = MAIN_BRANCH,
+        branch: Optional[str] = None,
     ) -> None:
         """
         Shorthand for deleting rows from the table.
diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py
@@ -105,30 +105,39 @@ class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]):
     _added_data_files: List[DataFile]
     _manifest_num_counter: itertools.count[int]
     _deleted_data_files: Set[DataFile]
-    _branch: str
 
     def __init__(
         self,
         operation: Operation,
         transaction: Transaction,
         io: FileIO,
-        branch: str,
         commit_uuid: Optional[uuid.UUID] = None,
         snapshot_properties: Dict[str, str] = EMPTY_DICT,
+        branch: str = MAIN_BRANCH,
     ) -> None:
         super().__init__(transaction)
         self.commit_uuid = commit_uuid or uuid.uuid4()
         self._io = io
         self._operation = operation
         self._snapshot_id = self._transaction.table_metadata.new_snapshot_id()
-        self._branch = branch
-        self._parent_snapshot_id = (
-            snapshot.snapshot_id if (snapshot := self._transaction.table_metadata.snapshot_by_name(self._branch)) else None
-        )
         self._added_data_files = []
         self._deleted_data_files = set()
         self.snapshot_properties = snapshot_properties
         self._manifest_num_counter = itertools.count(0)
+        self._set_target_branch(branch=branch)
+        self._parent_snapshot_id = (
+            snapshot.snapshot_id if (snapshot := self._transaction.table_metadata.snapshot_by_name(self._target_branch)) else None
+        )
+
+    def _set_target_branch(self, branch: str) -> None:
+        """Validate `branch` and store it as `self._target_branch`; raise ValueError if it is None or a tag."""
+        if branch is None:
+            # Raise instead of assert: asserts are stripped under `python -O` and surface as AssertionError.
+            raise ValueError("Invalid branch name: null")
+        refs = self._transaction.table_metadata.refs
+        if branch in refs and refs[branch].snapshot_ref_type != SnapshotRefType.BRANCH:
+            raise ValueError(f"{branch} is a tag, not a branch. Tags cannot be targets for producing snapshots")
+        self._target_branch = branch
 
     def append_data_file(self, data_file: DataFile) -> _SnapshotProducer[U]:
         self._added_data_files.append(data_file)
@@ -276,16 +285,16 @@ def _commit(self) -> UpdatesAndRequirements:
                 SetSnapshotRefUpdate(
                     snapshot_id=self._snapshot_id,
                     parent_snapshot_id=self._parent_snapshot_id,
-                    ref_name=self._branch,
+                    ref_name=self._target_branch,
                     type=SnapshotRefType.BRANCH,
                 ),
             ),
             (
                 AssertRefSnapshotId(
-                    snapshot_id=self._transaction.table_metadata.refs[self._branch].snapshot_id
-                    if self._branch in self._transaction.table_metadata.refs
+                    snapshot_id=self._transaction.table_metadata.refs[self._target_branch].snapshot_id
+                    if self._target_branch in self._transaction.table_metadata.refs
                     else self._transaction.table_metadata.current_snapshot_id,
-                    ref=self._branch,
+                    ref=self._target_branch,
                 ),
             ),
         )
@@ -338,7 +347,7 @@ def __init__(
         commit_uuid: Optional[uuid.UUID] = None,
         snapshot_properties: Dict[str, str] = EMPTY_DICT,
     ):
-        super().__init__(operation, transaction, io, branch, commit_uuid, snapshot_properties)
+        super().__init__(operation, transaction, io, commit_uuid, snapshot_properties, branch)
         self._predicate = AlwaysFalse()
         self._case_sensitive = True
 
@@ -503,7 +512,7 @@ def __init__(
     ) -> None:
         from pyiceberg.table import TableProperties
 
-        super().__init__(operation, transaction, io, branch, commit_uuid, snapshot_properties)
+        super().__init__(operation, transaction, io, commit_uuid, snapshot_properties, branch)
         self._target_size_bytes = property_as_int(
             self._transaction.table_metadata.properties,
             TableProperties.MANIFEST_TARGET_SIZE_BYTES,
@@ -549,7 +558,7 @@ def _existing_manifests(self) -> List[ManifestFile]:
         """Determine if there are any existing manifest files."""
         existing_files = []
 
-        if snapshot := self._transaction.table_metadata.snapshot_by_name(name=self._branch):
+        if snapshot := self._transaction.table_metadata.snapshot_by_name(name=self._target_branch):
             for manifest_file in snapshot.manifests(io=self._io):
                 entries = manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True)
                 found_deleted_data_files = [entry.data_file for entry in entries if entry.data_file in self._deleted_data_files]
@@ -623,12 +632,16 @@ class UpdateSnapshot:
     _snapshot_properties: Dict[str, str]
 
     def __init__(
-        self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str = MAIN_BRANCH
+        self,
+        transaction: Transaction,
+        io: FileIO,
+        snapshot_properties: Dict[str, str] = EMPTY_DICT,
+        branch: Optional[str] = MAIN_BRANCH,
     ) -> None:
         self._transaction = transaction
         self._io = io
         self._snapshot_properties = snapshot_properties
-        self._branch = branch
+        self._branch = branch if branch is not None else MAIN_BRANCH
 
     def fast_append(self) -> _FastAppendFiles:
         return _FastAppendFiles(
diff --git a/pyiceberg/utils/concurrent.py b/pyiceberg/utils/concurrent.py
@@ -25,6 +25,11 @@
 class ExecutorFactory:
     _instance: Optional[Executor] = None
 
+    @staticmethod
+    def max_workers() -> Optional[int]:
+        """Return the "max-workers" setting read via Config().get_int (Optional[int], so it may be None)."""
+        return Config().get_int("max-workers")
+
     @staticmethod
     def get_or_create() -> Executor:
         """Return the same executor in each call."""
@@ -33,8 +38,3 @@ def get_or_create() -> Executor:
             ExecutorFactory._instance = ThreadPoolExecutor(max_workers=max_workers)
 
         return ExecutorFactory._instance
-
-    @staticmethod
-    def max_workers() -> Optional[int]:
-        """Return the max number of workers configured."""
-        return Config().get_int("max-workers")