Skip to content

Commit 207fc73

Browse files
committed
Merge branch 'test_matrix' of https://github.com/JE-Chen/iceberg-python into test_matrix
2 parents bd40cbd + 733735b commit 207fc73

21 files changed

Lines changed: 1454 additions & 447 deletions

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ repos:
2323
hooks:
2424
- id: trailing-whitespace
2525
- id: end-of-file-fixer
26-
- id: check-docstring-first
2726
- id: debug-statements
2827
- id: check-yaml
2928
- id: check-ast

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ help: ## Display this help
2222
install-poetry: ## Install poetry if the user has not done that yet.
2323
@if ! command -v poetry &> /dev/null; then \
2424
echo "Poetry could not be found. Installing..."; \
25-
pip install --user poetry==1.8.5; \
25+
pip install --user poetry==2.0.1; \
2626
else \
2727
echo "Poetry is already installed."; \
2828
fi

dev/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
4242
ENV ICEBERG_VERSION=1.6.0
4343
ENV PYICEBERG_VERSION=0.8.1
4444

45-
RUN curl --retry 5 -s -C - https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
45+
RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
4646
&& tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
4747
&& rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
4848

mkdocs/docs/api.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,6 +1258,29 @@ with table.manage_snapshots() as ms:
12581258
ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789")
12591259
```
12601260

1261+
## Table Statistics Management
1262+
1263+
Manage table statistics with operations through the `Table` API:
1264+
1265+
```python
1266+
# To run a specific operation
1267+
table.update_statistics().set_statistics(snapshot_id=1, statistics_file=statistics_file).commit()
1268+
# To run multiple operations
1269+
table.update_statistics() \
1270+
    .set_statistics(snapshot_id1, statistics_file1) \
1271+
    .remove_statistics(snapshot_id2) \
1272+
    .commit()
1273+
# Operations are applied on commit.
1274+
```
1275+
1276+
You can also use context managers to make more changes:
1277+
1278+
```python
1279+
with table.update_statistics() as update:
1280+
update.set_statistics(snapshot_id1, statistics_file)
1281+
update.remove_statistics(snapshot_id2)
1282+
```
1283+
12611284
## Query the data
12621285

12631286
To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID:

poetry.lock

Lines changed: 711 additions & 343 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/io/__init__.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,6 @@
4848

4949
logger = logging.getLogger(__name__)
5050

51-
ADLFS_CONNECTION_STRING = "adlfs.connection-string"
52-
ADLFS_ACCOUNT_NAME = "adlfs.account-name"
53-
ADLFS_ACCOUNT_KEY = "adlfs.account-key"
54-
ADLFS_SAS_TOKEN = "adlfs.sas-token"
55-
ADLFS_TENANT_ID = "adlfs.tenant-id"
56-
ADLFS_CLIENT_ID = "adlfs.client-id"
57-
ADLFS_ClIENT_SECRET = "adlfs.client-secret"
58-
ADLFS_PREFIX = "adlfs"
5951
AWS_REGION = "client.region"
6052
AWS_ACCESS_KEY_ID = "client.access-key-id"
6153
AWS_SECRET_ACCESS_KEY = "client.secret-access-key"
@@ -94,7 +86,6 @@
9486
GCS_CACHE_TIMEOUT = "gcs.cache-timeout"
9587
GCS_REQUESTER_PAYS = "gcs.requester-pays"
9688
GCS_SESSION_KWARGS = "gcs.session-kwargs"
97-
GCS_ENDPOINT = "gcs.endpoint"
9889
GCS_SERVICE_HOST = "gcs.service.host"
9990
GCS_DEFAULT_LOCATION = "gcs.default-bucket-location"
10091
GCS_VERSION_AWARE = "gcs.version-aware"

pyiceberg/io/fsspec.py

Lines changed: 8 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,6 @@
4040
from pyiceberg.catalog import TOKEN
4141
from pyiceberg.exceptions import SignError
4242
from pyiceberg.io import (
43-
ADLFS_ACCOUNT_KEY,
44-
ADLFS_ACCOUNT_NAME,
45-
ADLFS_CLIENT_ID,
46-
ADLFS_CONNECTION_STRING,
47-
ADLFS_PREFIX,
48-
ADLFS_SAS_TOKEN,
49-
ADLFS_TENANT_ID,
5043
ADLS_ACCOUNT_KEY,
5144
ADLS_ACCOUNT_NAME,
5245
ADLS_CLIENT_ID,
@@ -61,7 +54,6 @@
6154
GCS_CACHE_TIMEOUT,
6255
GCS_CONSISTENCY,
6356
GCS_DEFAULT_LOCATION,
64-
GCS_ENDPOINT,
6557
GCS_PROJECT_ID,
6658
GCS_REQUESTER_PAYS,
6759
GCS_SERVICE_HOST,
@@ -78,7 +70,6 @@
7870
S3_SIGNER_ENDPOINT,
7971
S3_SIGNER_ENDPOINT_DEFAULT,
8072
S3_SIGNER_URI,
81-
ADLFS_ClIENT_SECRET,
8273
ADLS_ClIENT_SECRET,
8374
FileIO,
8475
InputFile,
@@ -87,7 +78,6 @@
8778
OutputStream,
8879
)
8980
from pyiceberg.typedef import Properties
90-
from pyiceberg.utils.deprecated import deprecation_message
9181
from pyiceberg.utils.properties import get_first_property_value, property_as_bool
9282

9383
logger = logging.getLogger(__name__)
@@ -172,12 +162,6 @@ def _gs(properties: Properties) -> AbstractFileSystem:
172162
# https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
173163
from gcsfs import GCSFileSystem
174164

175-
if properties.get(GCS_ENDPOINT):
176-
deprecation_message(
177-
deprecated_in="0.8.0",
178-
removed_in="0.9.0",
179-
help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead",
180-
)
181165
return GCSFileSystem(
182166
project=properties.get(GCS_PROJECT_ID),
183167
access=properties.get(GCS_ACCESS, "full_control"),
@@ -186,7 +170,7 @@ def _gs(properties: Properties) -> AbstractFileSystem:
186170
cache_timeout=properties.get(GCS_CACHE_TIMEOUT),
187171
requester_pays=property_as_bool(properties, GCS_REQUESTER_PAYS, False),
188172
session_kwargs=json.loads(properties.get(GCS_SESSION_KWARGS, "{}")),
189-
endpoint_url=get_first_property_value(properties, GCS_SERVICE_HOST, GCS_ENDPOINT),
173+
endpoint_url=properties.get(GCS_SERVICE_HOST),
190174
default_location=properties.get(GCS_DEFAULT_LOCATION),
191175
version_aware=property_as_bool(properties, GCS_VERSION_AWARE, False),
192176
)
@@ -195,50 +179,14 @@ def _gs(properties: Properties) -> AbstractFileSystem:
195179
def _adls(properties: Properties) -> AbstractFileSystem:
196180
from adlfs import AzureBlobFileSystem
197181

198-
for property_name in properties:
199-
if property_name.startswith(ADLFS_PREFIX):
200-
deprecation_message(
201-
deprecated_in="0.8.0",
202-
removed_in="0.9.0",
203-
help_message=f"The property {property_name} is deprecated. Please use properties that start with adls.",
204-
)
205-
206182
return AzureBlobFileSystem(
207-
connection_string=get_first_property_value(
208-
properties,
209-
ADLS_CONNECTION_STRING,
210-
ADLFS_CONNECTION_STRING,
211-
),
212-
account_name=get_first_property_value(
213-
properties,
214-
ADLS_ACCOUNT_NAME,
215-
ADLFS_ACCOUNT_NAME,
216-
),
217-
account_key=get_first_property_value(
218-
properties,
219-
ADLS_ACCOUNT_KEY,
220-
ADLFS_ACCOUNT_KEY,
221-
),
222-
sas_token=get_first_property_value(
223-
properties,
224-
ADLS_SAS_TOKEN,
225-
ADLFS_SAS_TOKEN,
226-
),
227-
tenant_id=get_first_property_value(
228-
properties,
229-
ADLS_TENANT_ID,
230-
ADLFS_TENANT_ID,
231-
),
232-
client_id=get_first_property_value(
233-
properties,
234-
ADLS_CLIENT_ID,
235-
ADLFS_CLIENT_ID,
236-
),
237-
client_secret=get_first_property_value(
238-
properties,
239-
ADLS_ClIENT_SECRET,
240-
ADLFS_ClIENT_SECRET,
241-
),
183+
connection_string=properties.get(ADLS_CONNECTION_STRING),
184+
account_name=properties.get(ADLS_ACCOUNT_NAME),
185+
account_key=properties.get(ADLS_ACCOUNT_KEY),
186+
sas_token=properties.get(ADLS_SAS_TOKEN),
187+
tenant_id=properties.get(ADLS_TENANT_ID),
188+
client_id=properties.get(ADLS_CLIENT_ID),
189+
client_secret=properties.get(ADLS_ClIENT_SECRET),
242190
)
243191

244192

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@
9090
AWS_SECRET_ACCESS_KEY,
9191
AWS_SESSION_TOKEN,
9292
GCS_DEFAULT_LOCATION,
93-
GCS_ENDPOINT,
9493
GCS_SERVICE_HOST,
9594
GCS_TOKEN,
9695
GCS_TOKEN_EXPIRES_AT_MS,
@@ -166,7 +165,6 @@
166165
from pyiceberg.utils.concurrent import ExecutorFactory
167166
from pyiceberg.utils.config import Config
168167
from pyiceberg.utils.datetime import millis_to_datetime
169-
from pyiceberg.utils.deprecated import deprecation_message
170168
from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
171169
from pyiceberg.utils.singleton import Singleton
172170
from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
@@ -471,13 +469,7 @@ def _initialize_gcs_fs(self) -> FileSystem:
471469
gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration))
472470
if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION):
473471
gcs_kwargs["default_bucket_location"] = bucket_location
474-
if endpoint := get_first_property_value(self.properties, GCS_SERVICE_HOST, GCS_ENDPOINT):
475-
if self.properties.get(GCS_ENDPOINT):
476-
deprecation_message(
477-
deprecated_in="0.8.0",
478-
removed_in="0.9.0",
479-
help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead",
480-
)
472+
if endpoint := self.properties.get(GCS_SERVICE_HOST):
481473
url_parts = urlparse(endpoint)
482474
gcs_kwargs["scheme"] = url_parts.scheme
483475
gcs_kwargs["endpoint_override"] = url_parts.netloc

pyiceberg/table/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118
_FastAppendFiles,
119119
)
120120
from pyiceberg.table.update.spec import UpdateSpec
121+
from pyiceberg.table.update.statistics import UpdateStatistics
121122
from pyiceberg.transforms import IdentityTransform
122123
from pyiceberg.typedef import (
123124
EMPTY_DICT,
@@ -1043,6 +1044,23 @@ def manage_snapshots(self) -> ManageSnapshots:
10431044
"""
10441045
return ManageSnapshots(transaction=Transaction(self, autocommit=True))
10451046

1047+
def update_statistics(self) -> UpdateStatistics:
1048+
"""
1049+
Shorthand to run statistics management operations like add statistics and remove statistics.
1050+
1051+
Use table.update_statistics().<operation>().commit() to run a specific operation.
1052+
Use table.update_statistics().<operation-one>().<operation-two>().commit() to run multiple operations.
1053+
1054+
Pending changes are applied on commit.
1055+
1056+
We can also use context managers to make more changes. For example:
1057+
1058+
with table.update_statistics() as update:
1059+
update.set_statistics(snapshot_id=1, statistics_file=statistics_file)
1060+
update.remove_statistics(snapshot_id=2)
1061+
"""
1062+
return UpdateStatistics(transaction=Transaction(self, autocommit=True))
1063+
10461064
def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema:
10471065
"""Create a new UpdateSchema to alter the columns of this table.
10481066

pyiceberg/table/metadata.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
SortOrder,
4545
assign_fresh_sort_order_ids,
4646
)
47+
from pyiceberg.table.statistics import StatisticsFile
4748
from pyiceberg.typedef import (
4849
EMPTY_DICT,
4950
IcebergBaseModel,
@@ -221,6 +222,14 @@ class TableMetadataCommonFields(IcebergBaseModel):
221222
There is always a main branch reference pointing to the
222223
current-snapshot-id even if the refs map is null."""
223224

225+
statistics: List[StatisticsFile] = Field(default_factory=list)
226+
"""An optional list of table statistics files.
227+
Table statistics files are valid Puffin files. Statistics are
228+
informational. A reader can choose to ignore statistics
229+
information. Statistics support is not required to read the
230+
table correctly. A table can contain many statistics files
231+
associated with different table snapshots."""
232+
224233
# validators
225234
@field_validator("properties", mode="before")
226235
def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]:

0 commit comments

Comments
 (0)