13 | 13 | Datalake S3 Client |
14 | 14 | """ |
15 | 15 |
| 16 | +import re |
16 | 17 | from functools import partial |
17 | | -from typing import Callable, Iterable, Optional, Set, Tuple # noqa: UP035 |
| 18 | +from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple # noqa: UP035 |
18 | 19 |
19 | 20 | from metadata.clients.aws_client import AWSClient |
20 | 21 | from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import ( |
@@ -61,31 +62,72 @@ def get_database_schema_names(self, bucket_name: Optional[str]) -> Iterable[str] |
61 | 62 | for bucket in self._client.list_buckets()["Buckets"]: |
62 | 63 | yield bucket["Name"] |
63 | 64 |
| 65 | + _ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$") |
| 66 | + |
| 67 | + @staticmethod |
| 68 | + def _should_skip_s3_cold_storage(key: dict) -> bool: |
| 69 | + storage_class = key.get("StorageClass", "STANDARD") |
| 70 | + archive_status = key.get("ArchiveStatus", "") |
| 71 | + return storage_class in S3_COLD_STORAGE_CLASSES or archive_status in { |
| 72 | + "ARCHIVE_ACCESS", |
| 73 | + "DEEP_ARCHIVE_ACCESS", |
| 74 | + } |
| 75 | + |
| 76 | + def _classify_s3_object( |
| 77 | + self, |
| 78 | + iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]], # noqa: UP006, UP045 |
| 79 | + regular_files: List[Tuple[str, Optional[int]]], # noqa: UP006, UP045 |
| 80 | + key_name: str, |
| 81 | + size: Optional[int], # noqa: UP045 |
| 82 | + ) -> None: |
| 83 | + match = self._ICEBERG_METADATA_RE.match(key_name) |
| 84 | + if match: |
| 85 | + table_dir, version = match.group(1), int(match.group(2)) |
| 86 | + existing = iceberg_tables.get(table_dir) |
| 87 | + if existing is None or version > existing[0]: |
| 88 | + iceberg_tables[table_dir] = (version, key_name, size) |
| 89 | + else: |
| 90 | + regular_files.append((key_name, size)) |
| 91 | + |
64 | 92 | def get_table_names( |
65 | 93 | self, |
66 | 94 | bucket_name: str, |
67 | 95 | prefix: Optional[str], # noqa: UP045 |
68 | 96 | skip_cold_storage: bool = False, |
69 | 97 | ) -> Iterable[Tuple[str, Optional[int]]]: # noqa: UP006, UP045 |
70 | | - kwargs = {"Bucket": bucket_name} |
71 | | - |
| 98 | + """ |
| 99 | + Lists tables in an S3 bucket using a single pass over the object listing.
| 100 | +
| 101 | + Iceberg table directories are identified by objects matching |
| 102 | + ``<table_dir>/metadata/v<N>.metadata.json``. Only the object with the |
| 103 | + highest integer version is yielded per table directory. Regular files |
| 104 | + not under any Iceberg table directory are also yielded. |
| 105 | + """ |
| 106 | + kwargs: Dict[str, str] = {"Bucket": bucket_name} # noqa: UP006 |
72 | 107 | if prefix: |
73 | 108 | kwargs["Prefix"] = prefix if prefix.endswith("/") else f"{prefix}/" |
74 | 109 |
| 110 | + iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {} # noqa: UP006 |
| 111 | + regular_files: List[Tuple[str, Optional[int]]] = [] # noqa: UP006 |
| 112 | + |
75 | 113 | for key in list_s3_objects(self._client, **kwargs): |
76 | | - if skip_cold_storage: |
77 | | - storage_class = key.get("StorageClass", "STANDARD") |
78 | | - archive_status = key.get("ArchiveStatus", "") |
79 | | - if storage_class in S3_COLD_STORAGE_CLASSES or archive_status in { |
80 | | - "ARCHIVE_ACCESS", |
81 | | - "DEEP_ARCHIVE_ACCESS", |
82 | | - }: |
83 | | - logger.debug( |
84 | | - f"Skipping cold storage object: {key['Key']} " |
85 | | - f"(StorageClass: {storage_class}, ArchiveStatus: {archive_status})" |
86 | | - ) |
87 | | - continue |
88 | | - yield key["Key"], key.get("Size") |
| 114 | + key_name = key["Key"] |
| 115 | + size = key.get("Size") |
| 116 | + if skip_cold_storage and self._should_skip_s3_cold_storage(key): |
| 117 | + logger.debug( |
| 118 | + f"Skipping cold storage object: {key_name} " |
| 119 | + f"(StorageClass: {key.get('StorageClass', 'STANDARD')}, " |
| 120 | + f"ArchiveStatus: {key.get('ArchiveStatus', '')})" |
| 121 | + ) |
| 122 | + continue |
| 123 | + self._classify_s3_object(iceberg_tables, regular_files, key_name, size) |
| 124 | + |
| 125 | + iceberg_dirs = set(iceberg_tables.keys()) |
| 126 | + for _, metadata_key, size in iceberg_tables.values(): |
| 127 | + yield metadata_key, size |
| 128 | + for file_path, size in regular_files: |
| 129 | + if not any(file_path.startswith(d + "/") for d in iceberg_dirs): |
| 130 | + yield file_path, size |
89 | 131 |
90 | 132 | def get_folders_prefix(self, bucket_name: str, prefix: Optional[str]) -> Iterable[str]: # noqa: UP045 |
91 | 133 | for page in self._client.get_paginator("list_objects_v2").paginate( |
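Since the new logic buffers state across the whole listing, here is a minimal standalone sketch (not part of the PR) that reproduces the classification, version-selection, and suppression rules on hand-written keys. `classify` is a hypothetical stand-in for `_classify_s3_object`, and all key names and sizes below are made up:

```python
import re
from typing import Dict, List, Optional, Tuple

# Same pattern as the PR: <table_dir>/metadata/v<N>.metadata.json
ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")

def classify(
    keys: List[Tuple[str, Optional[int]]]
) -> Tuple[Dict[str, Tuple[int, str, Optional[int]]], List[Tuple[str, Optional[int]]]]:
    """Mirror of _classify_s3_object applied over a list of (key, size) pairs."""
    iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}
    regular_files: List[Tuple[str, Optional[int]]] = []
    for key_name, size in keys:
        match = ICEBERG_METADATA_RE.match(key_name)
        if match:
            table_dir, version = match.group(1), int(match.group(2))
            existing = iceberg_tables.get(table_dir)
            # Keep only the highest metadata version per table directory
            if existing is None or version > existing[0]:
                iceberg_tables[table_dir] = (version, key_name, size)
        else:
            regular_files.append((key_name, size))
    return iceberg_tables, regular_files

keys = [
    ("sales/metadata/v1.metadata.json", 10),
    ("sales/metadata/v2.metadata.json", 12),  # wins: higher version
    ("sales/data/part-0.parquet", 1024),      # under an Iceberg dir -> suppressed later
    ("raw/events.csv", 2048),                 # plain file -> yielded as-is
]
tables, files = classify(keys)
assert tables == {"sales": (2, "sales/metadata/v2.metadata.json", 12)}

# The final filtering step from get_table_names: drop regular files that
# live under a detected Iceberg table directory.
iceberg_dirs = set(tables)
visible = [(k, s) for k, s in files if not any(k.startswith(d + "/") for d in iceberg_dirs)]
assert visible == [("raw/events.csv", 2048)]
```

Buffering regular files until the listing finishes is what lets the method suppress data files under a table directory even when they appear in the listing before that directory's metadata file (as they do under S3's lexicographic ordering, since `data/` sorts before `metadata/`).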