Skip to content

Commit 207fc73

Browse files
committed
Merge branch 'test_matrix' of https://github.com/JE-Chen/iceberg-python into test_matrix
2 parents bd40cbd + 733735b commit 207fc73

21 files changed

Lines changed: 1454 additions & 447 deletions

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ repos:
2323
hooks:
2424
- id: trailing-whitespace
2525
- id: end-of-file-fixer
26-
- id: check-docstring-first
2726
- id: debug-statements
2827
- id: check-yaml
2928
- id: check-ast

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ help: ## Display this help
2222
install-poetry: ## Install poetry if the user has not done that yet.
2323
@if ! command -v poetry &> /dev/null; then \
2424
echo "Poetry could not be found. Installing..."; \
25-
pip install --user poetry==1.8.5; \
25+
pip install --user poetry==2.0.1; \
2626
else \
2727
echo "Poetry is already installed."; \
2828
fi

dev/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
4242
ENV ICEBERG_VERSION=1.6.0
4343
ENV PYICEBERG_VERSION=0.8.1
4444

45-
RUN curl --retry 5 -s -C - https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
45+
RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
4646
&& tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
4747
&& rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
4848

mkdocs/docs/api.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,6 +1258,29 @@ with table.manage_snapshots() as ms:
12581258
ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789")
12591259
```
12601260

1261+
## Table Statistics Management
1262+
1263+
Manage table statistics with operations through the `Table` API:
1264+
1265+
```python
1266+
# To run a specific operation
1267+
table.update_statistics().set_statistics(snapshot_id=1, statistics_file=statistics_file).commit()
1268+
# To run multiple operations
1269+
table.update_statistics() \
1270+
    .set_statistics(snapshot_id1, statistics_file1) \
1271+
    .remove_statistics(snapshot_id2) \
1272+
    .commit()
1273+
# Operations are applied on commit.
1274+
```
1275+
1276+
You can also use context managers to make more changes:
1277+
1278+
```python
1279+
with table.update_statistics() as update:
1280+
update.set_statistics(snapshot_id1, statistics_file)
1281+
update.remove_statistics(snapshot_id2)
1282+
```
1283+
12611284
## Query the data
12621285

12631286
To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID:

poetry.lock

Lines changed: 711 additions & 343 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/io/__init__.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,6 @@
4848

4949
logger = logging.getLogger(__name__)
5050

51-
ADLFS_CONNECTION_STRING = "adlfs.connection-string"
52-
ADLFS_ACCOUNT_NAME = "adlfs.account-name"
53-
ADLFS_ACCOUNT_KEY = "adlfs.account-key"
54-
ADLFS_SAS_TOKEN = "adlfs.sas-token"
55-
ADLFS_TENANT_ID = "adlfs.tenant-id"
56-
ADLFS_CLIENT_ID = "adlfs.client-id"
57-
ADLFS_ClIENT_SECRET = "adlfs.client-secret"
58-
ADLFS_PREFIX = "adlfs"
5951
AWS_REGION = "client.region"
6052
AWS_ACCESS_KEY_ID = "client.access-key-id"
6153
AWS_SECRET_ACCESS_KEY = "client.secret-access-key"
@@ -94,7 +86,6 @@
9486
GCS_CACHE_TIMEOUT = "gcs.cache-timeout"
9587
GCS_REQUESTER_PAYS = "gcs.requester-pays"
9688
GCS_SESSION_KWARGS = "gcs.session-kwargs"
97-
GCS_ENDPOINT = "gcs.endpoint"
9889
GCS_SERVICE_HOST = "gcs.service.host"
9990
GCS_DEFAULT_LOCATION = "gcs.default-bucket-location"
10091
GCS_VERSION_AWARE = "gcs.version-aware"

pyiceberg/io/fsspec.py

Lines changed: 8 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,6 @@
4040
from pyiceberg.catalog import TOKEN
4141
from pyiceberg.exceptions import SignError
4242
from pyiceberg.io import (
43-
ADLFS_ACCOUNT_KEY,
44-
ADLFS_ACCOUNT_NAME,
45-
ADLFS_CLIENT_ID,
46-
ADLFS_CONNECTION_STRING,
47-
ADLFS_PREFIX,
48-
ADLFS_SAS_TOKEN,
49-
ADLFS_TENANT_ID,
5043
ADLS_ACCOUNT_KEY,
5144
ADLS_ACCOUNT_NAME,
5245
ADLS_CLIENT_ID,
@@ -61,7 +54,6 @@
6154
GCS_CACHE_TIMEOUT,
6255
GCS_CONSISTENCY,
6356
GCS_DEFAULT_LOCATION,
64-
GCS_ENDPOINT,
6557
GCS_PROJECT_ID,
6658
GCS_REQUESTER_PAYS,
6759
GCS_SERVICE_HOST,
@@ -78,7 +70,6 @@
7870
S3_SIGNER_ENDPOINT,
7971
S3_SIGNER_ENDPOINT_DEFAULT,
8072
S3_SIGNER_URI,
81-
ADLFS_ClIENT_SECRET,
8273
ADLS_ClIENT_SECRET,
8374
FileIO,
8475
InputFile,
@@ -87,7 +78,6 @@
8778
OutputStream,
8879
)
8980
from pyiceberg.typedef import Properties
90-
from pyiceberg.utils.deprecated import deprecation_message
9181
from pyiceberg.utils.properties import get_first_property_value, property_as_bool
9282

9383
logger = logging.getLogger(__name__)
@@ -172,12 +162,6 @@ def _gs(properties: Properties) -> AbstractFileSystem:
172162
# https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem
173163
from gcsfs import GCSFileSystem
174164

175-
if properties.get(GCS_ENDPOINT):
176-
deprecation_message(
177-
deprecated_in="0.8.0",
178-
removed_in="0.9.0",
179-
help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead",
180-
)
181165
return GCSFileSystem(
182166
project=properties.get(GCS_PROJECT_ID),
183167
access=properties.get(GCS_ACCESS, "full_control"),
@@ -186,7 +170,7 @@ def _gs(properties: Properties) -> AbstractFileSystem:
186170
cache_timeout=properties.get(GCS_CACHE_TIMEOUT),
187171
requester_pays=property_as_bool(properties, GCS_REQUESTER_PAYS, False),
188172
session_kwargs=json.loads(properties.get(GCS_SESSION_KWARGS, "{}")),
189-
endpoint_url=get_first_property_value(properties, GCS_SERVICE_HOST, GCS_ENDPOINT),
173+
endpoint_url=properties.get(GCS_SERVICE_HOST),
190174
default_location=properties.get(GCS_DEFAULT_LOCATION),
191175
version_aware=property_as_bool(properties, GCS_VERSION_AWARE, False),
192176
)
@@ -195,50 +179,14 @@ def _gs(properties: Properties) -> AbstractFileSystem:
195179
def _adls(properties: Properties) -> AbstractFileSystem:
196180
from adlfs import AzureBlobFileSystem
197181

198-
for property_name in properties:
199-
if property_name.startswith(ADLFS_PREFIX):
200-
deprecation_message(
201-
deprecated_in="0.8.0",
202-
removed_in="0.9.0",
203-
help_message=f"The property {property_name} is deprecated. Please use properties that start with adls.",
204-
)
205-
206182
return AzureBlobFileSystem(
207-
connection_string=get_first_property_value(
208-
properties,
209-
ADLS_CONNECTION_STRING,
210-
ADLFS_CONNECTION_STRING,
211-
),
212-
account_name=get_first_property_value(
213-
properties,
214-
ADLS_ACCOUNT_NAME,
215-
ADLFS_ACCOUNT_NAME,
216-
),
217-
account_key=get_first_property_value(
218-
properties,
219-
ADLS_ACCOUNT_KEY,
220-
ADLFS_ACCOUNT_KEY,
221-
),
222-
sas_token=get_first_property_value(
223-
properties,
224-
ADLS_SAS_TOKEN,
225-
ADLFS_SAS_TOKEN,
226-
),
227-
tenant_id=get_first_property_value(
228-
properties,
229-
ADLS_TENANT_ID,
230-
ADLFS_TENANT_ID,
231-
),
232-
client_id=get_first_property_value(
233-
properties,
234-
ADLS_CLIENT_ID,
235-
ADLFS_CLIENT_ID,
236-
),
237-
client_secret=get_first_property_value(
238-
properties,
239-
ADLS_ClIENT_SECRET,
240-
ADLFS_ClIENT_SECRET,
241-
),
183+
connection_string=properties.get(ADLS_CONNECTION_STRING),
184+
account_name=properties.get(ADLS_ACCOUNT_NAME),
185+
account_key=properties.get(ADLS_ACCOUNT_KEY),
186+
sas_token=properties.get(ADLS_SAS_TOKEN),
187+
tenant_id=properties.get(ADLS_TENANT_ID),
188+
client_id=properties.get(ADLS_CLIENT_ID),
189+
client_secret=properties.get(ADLS_ClIENT_SECRET),
242190
)
243191

244192

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@
9090
AWS_SECRET_ACCESS_KEY,
9191
AWS_SESSION_TOKEN,
9292
GCS_DEFAULT_LOCATION,
93-
GCS_ENDPOINT,
9493
GCS_SERVICE_HOST,
9594
GCS_TOKEN,
9695
GCS_TOKEN_EXPIRES_AT_MS,
@@ -166,7 +165,6 @@
166165
from pyiceberg.utils.concurrent import ExecutorFactory
167166
from pyiceberg.utils.config import Config
168167
from pyiceberg.utils.datetime import millis_to_datetime
169-
from pyiceberg.utils.deprecated import deprecation_message
170168
from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
171169
from pyiceberg.utils.singleton import Singleton
172170
from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
@@ -471,13 +469,7 @@ def _initialize_gcs_fs(self) -> FileSystem:
471469
gcs_kwargs["credential_token_expiration"] = millis_to_datetime(int(expiration))
472470
if bucket_location := self.properties.get(GCS_DEFAULT_LOCATION):
473471
gcs_kwargs["default_bucket_location"] = bucket_location
474-
if endpoint := get_first_property_value(self.properties, GCS_SERVICE_HOST, GCS_ENDPOINT):
475-
if self.properties.get(GCS_ENDPOINT):
476-
deprecation_message(
477-
deprecated_in="0.8.0",
478-
removed_in="0.9.0",
479-
help_message=f"The property {GCS_ENDPOINT} is deprecated, please use {GCS_SERVICE_HOST} instead",
480-
)
472+
if endpoint := self.properties.get(GCS_SERVICE_HOST):
481473
url_parts = urlparse(endpoint)
482474
gcs_kwargs["scheme"] = url_parts.scheme
483475
gcs_kwargs["endpoint_override"] = url_parts.netloc

pyiceberg/table/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118
_FastAppendFiles,
119119
)
120120
from pyiceberg.table.update.spec import UpdateSpec
121+
from pyiceberg.table.update.statistics import UpdateStatistics
121122
from pyiceberg.transforms import IdentityTransform
122123
from pyiceberg.typedef import (
123124
EMPTY_DICT,
@@ -1043,6 +1044,23 @@ def manage_snapshots(self) -> ManageSnapshots:
10431044
"""
10441045
return ManageSnapshots(transaction=Transaction(self, autocommit=True))
10451046

1047+
def update_statistics(self) -> UpdateStatistics:
1048+
"""
1049+
Shorthand to run statistics management operations like add statistics and remove statistics.
1050+
1051+
Use table.update_statistics().<operation>().commit() to run a specific operation.
1052+
Use table.update_statistics().<operation-one>().<operation-two>().commit() to run multiple operations.
1053+
1054+
Pending changes are applied on commit.
1055+
1056+
We can also use context managers to make more changes. For example:
1057+
1058+
with table.update_statistics() as update:
1059+
update.set_statistics(snapshot_id=1, statistics_file=statistics_file)
1060+
update.remove_statistics(snapshot_id=2)
1061+
"""
1062+
return UpdateStatistics(transaction=Transaction(self, autocommit=True))
1063+
10461064
def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema:
10471065
"""Create a new UpdateSchema to alter the columns of this table.
10481066

pyiceberg/table/metadata.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
SortOrder,
4545
assign_fresh_sort_order_ids,
4646
)
47+
from pyiceberg.table.statistics import StatisticsFile
4748
from pyiceberg.typedef import (
4849
EMPTY_DICT,
4950
IcebergBaseModel,
@@ -221,6 +222,14 @@ class TableMetadataCommonFields(IcebergBaseModel):
221222
There is always a main branch reference pointing to the
222223
current-snapshot-id even if the refs map is null."""
223224

225+
statistics: List[StatisticsFile] = Field(default_factory=list)
226+
"""An optional list of table statistics files.
227+
Table statistics files are valid Puffin files. Statistics are
228+
informational. A reader can choose to ignore statistics
229+
information. Statistics support is not required to read the
230+
table correctly. A table can contain many statistics files
231+
associated with different table snapshots."""
232+
224233
# validators
225234
@field_validator("properties", mode="before")
226235
def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]:

0 commit comments

Comments
 (0)