
Commit 134c1eb

feat(datalake): add GCS/S3 Iceberg table ingestion support (#22644)
1 parent a1e7086 commit 134c1eb

7 files changed

Lines changed: 635 additions & 32 deletions


ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py

Lines changed: 50 additions & 9 deletions
@@ -14,9 +14,10 @@
 """
 
 import os
+import re
 from copy import deepcopy
 from functools import partial
-from typing import Callable, Iterable, List, Optional, Set, Tuple  # noqa: UP035
+from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple  # noqa: UP035
 
 from google.cloud import storage
 
@@ -107,21 +108,61 @@ def get_database_schema_names(self, bucket_name: Optional[str]) -> Iterable[str]
         for bucket in self._client.list_buckets():
             yield bucket.name
 
+    _ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")
+
+    @staticmethod
+    def _should_skip_gcs_cold_storage(blob) -> bool:
+        storage_class = getattr(blob, "storage_class", None)
+        return bool(storage_class and storage_class in GCS_COLD_STORAGE_CLASSES)
+
+    def _classify_gcs_blob(
+        self,
+        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]],  # noqa: UP006, UP045
+        regular_files: List[Tuple[str, Optional[int]]],  # noqa: UP006, UP045
+        blob,
+    ) -> None:
+        match = self._ICEBERG_METADATA_RE.match(blob.name)
+        if match:
+            table_dir, version = match.group(1), int(match.group(2))
+            existing = iceberg_tables.get(table_dir)
+            if existing is None or version > existing[0]:
+                iceberg_tables[table_dir] = (version, blob.name, blob.size)
+        else:
+            regular_files.append((blob.name, blob.size))
+
     def get_table_names(
         self,
         bucket_name: str,
         prefix: Optional[str],  # noqa: UP045
         skip_cold_storage: bool = False,
     ) -> Iterable[Tuple[str, Optional[int]]]:  # noqa: UP006, UP045
+        """
+        Lists tables in a GCS bucket using a single pass.
+
+        Iceberg table directories are identified by blobs matching
+        ``<table_dir>/metadata/v<N>.metadata.json``. Only the blob with the
+        highest integer version is yielded per table directory. Regular files
+        not under any Iceberg table directory are also yielded.
+        """
         bucket = self._client.get_bucket(bucket_name)
-
-        for key in bucket.list_blobs(prefix=prefix):
-            if skip_cold_storage:
-                storage_class = getattr(key, "storage_class", None)
-                if storage_class and storage_class in GCS_COLD_STORAGE_CLASSES:
-                    logger.debug(f"Skipping cold storage object: {key.name} (storage_class: {storage_class})")
-                    continue
-            yield key.name, key.size
+        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}  # noqa: UP006
+        regular_files: List[Tuple[str, Optional[int]]] = []  # noqa: UP006
+
+        for blob in bucket.list_blobs(prefix=prefix):
+            if skip_cold_storage and self._should_skip_gcs_cold_storage(blob):
+                logger.debug(
+                    f"Skipping cold storage object: {blob.name} "
+                    f"(storage_class: {getattr(blob, 'storage_class', None)})"
+                )
+                continue
+            self._classify_gcs_blob(iceberg_tables, regular_files, blob)
+
+        iceberg_dirs = set(iceberg_tables.keys())
+        for _, metadata_blob_path, size in iceberg_tables.values():
+            yield metadata_blob_path, size
+        for file_path, size in regular_files:
+            if not any(file_path.startswith(d + "/") for d in iceberg_dirs):
+                yield file_path, size
 
     def close(self, service_connection):
         os.environ.pop("GOOGLE_CLOUD_PROJECT", "")
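The listing above is now a single pass: Iceberg metadata blobs are deduplicated per table directory (keeping only the highest version), and data files that live under a recognized Iceberg directory are suppressed. Below is a minimal sketch of that selection logic, run against hypothetical object names rather than real GCS blobs.

# Minimal sketch of the single-pass classification above; the object names
# and sizes are hypothetical stand-ins for a GCS blob listing.
import re
from typing import Dict, List, Optional, Tuple

ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")

def classify(listing: List[Tuple[str, Optional[int]]]) -> List[Tuple[str, Optional[int]]]:
    iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}
    regular_files: List[Tuple[str, Optional[int]]] = []
    for name, size in listing:
        match = ICEBERG_METADATA_RE.match(name)
        if match:
            table_dir, version = match.group(1), int(match.group(2))
            existing = iceberg_tables.get(table_dir)
            # Keep only the highest metadata version per table directory.
            if existing is None or version > existing[0]:
                iceberg_tables[table_dir] = (version, name, size)
        else:
            regular_files.append((name, size))
    iceberg_dirs = set(iceberg_tables)
    results = [(path, size) for _, path, size in iceberg_tables.values()]
    # Drop regular files that sit under a recognized Iceberg table directory.
    results += [
        (path, size)
        for path, size in regular_files
        if not any(path.startswith(d + "/") for d in iceberg_dirs)
    ]
    return results

# Only v3.metadata.json and the standalone CSV survive.
print(classify([
    ("warehouse/orders/metadata/v1.metadata.json", 10),
    ("warehouse/orders/metadata/v3.metadata.json", 12),
    ("warehouse/orders/data/part-0000.parquet", 2048),
    ("exports/customers.csv", 512),
]))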

ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py

Lines changed: 58 additions & 16 deletions
@@ -13,8 +13,9 @@
 Datalake S3 Client
 """
 
+import re
 from functools import partial
-from typing import Callable, Iterable, Optional, Set, Tuple  # noqa: UP035
+from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple  # noqa: UP035
 
 from metadata.clients.aws_client import AWSClient
 from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import (
@@ -61,31 +62,72 @@ def get_database_schema_names(self, bucket_name: Optional[str]) -> Iterable[str]
         for bucket in self._client.list_buckets()["Buckets"]:
             yield bucket["Name"]
 
+    _ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")
+
+    @staticmethod
+    def _should_skip_s3_cold_storage(key: dict) -> bool:
+        storage_class = key.get("StorageClass", "STANDARD")
+        archive_status = key.get("ArchiveStatus", "")
+        return storage_class in S3_COLD_STORAGE_CLASSES or archive_status in {
+            "ARCHIVE_ACCESS",
+            "DEEP_ARCHIVE_ACCESS",
+        }
+
+    def _classify_s3_object(
+        self,
+        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]],  # noqa: UP006, UP045
+        regular_files: List[Tuple[str, Optional[int]]],  # noqa: UP006, UP045
+        key_name: str,
+        size: Optional[int],  # noqa: UP045
+    ) -> None:
+        match = self._ICEBERG_METADATA_RE.match(key_name)
+        if match:
+            table_dir, version = match.group(1), int(match.group(2))
+            existing = iceberg_tables.get(table_dir)
+            if existing is None or version > existing[0]:
+                iceberg_tables[table_dir] = (version, key_name, size)
+        else:
+            regular_files.append((key_name, size))
+
     def get_table_names(
         self,
         bucket_name: str,
         prefix: Optional[str],  # noqa: UP045
         skip_cold_storage: bool = False,
     ) -> Iterable[Tuple[str, Optional[int]]]:  # noqa: UP006, UP045
-        kwargs = {"Bucket": bucket_name}
-
+        """
+        Lists tables in an S3 bucket using a single pass.
+
+        Iceberg table directories are identified by objects matching
+        ``<table_dir>/metadata/v<N>.metadata.json``. Only the object with the
+        highest integer version is yielded per table directory. Regular files
+        not under any Iceberg table directory are also yielded.
+        """
+        kwargs: Dict[str, str] = {"Bucket": bucket_name}  # noqa: UP006
         if prefix:
             kwargs["Prefix"] = prefix if prefix.endswith("/") else f"{prefix}/"
 
+        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}  # noqa: UP006
+        regular_files: List[Tuple[str, Optional[int]]] = []  # noqa: UP006
+
         for key in list_s3_objects(self._client, **kwargs):
-            if skip_cold_storage:
-                storage_class = key.get("StorageClass", "STANDARD")
-                archive_status = key.get("ArchiveStatus", "")
-                if storage_class in S3_COLD_STORAGE_CLASSES or archive_status in {
-                    "ARCHIVE_ACCESS",
-                    "DEEP_ARCHIVE_ACCESS",
-                }:
-                    logger.debug(
-                        f"Skipping cold storage object: {key['Key']} "
-                        f"(StorageClass: {storage_class}, ArchiveStatus: {archive_status})"
-                    )
-                    continue
-            yield key["Key"], key.get("Size")
+            key_name = key["Key"]
+            size = key.get("Size")
+            if skip_cold_storage and self._should_skip_s3_cold_storage(key):
+                logger.debug(
+                    f"Skipping cold storage object: {key_name} "
+                    f"(StorageClass: {key.get('StorageClass', 'STANDARD')}, "
+                    f"ArchiveStatus: {key.get('ArchiveStatus', '')})"
+                )
+                continue
+            self._classify_s3_object(iceberg_tables, regular_files, key_name, size)
+
+        iceberg_dirs = set(iceberg_tables.keys())
+        for _, metadata_key, size in iceberg_tables.values():
+            yield metadata_key, size
+        for file_path, size in regular_files:
+            if not any(file_path.startswith(d + "/") for d in iceberg_dirs):
+                yield file_path, size
 
     def get_folders_prefix(self, bucket_name: str, prefix: Optional[str]) -> Iterable[str]:  # noqa: UP045
         for page in self._client.get_paginator("list_objects_v2").paginate(
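The S3 client mirrors the GCS classification and factors the cold-storage check into _should_skip_s3_cold_storage, which inspects an object's StorageClass and ArchiveStatus. A small sketch of that predicate on hand-written list_objects_v2-style entries; the cold-storage class set below is a hypothetical stand-in for the module-level S3_COLD_STORAGE_CLASSES constant.

# Hedged sketch: the class set is assumed here for illustration only.
S3_COLD_STORAGE_CLASSES = {"GLACIER", "DEEP_ARCHIVE"}

def should_skip_s3_cold_storage(key: dict) -> bool:
    # Cold if the storage class is archival, or if the object is an
    # Intelligent-Tiering object currently in an archive access tier.
    storage_class = key.get("StorageClass", "STANDARD")
    archive_status = key.get("ArchiveStatus", "")
    return storage_class in S3_COLD_STORAGE_CLASSES or archive_status in {
        "ARCHIVE_ACCESS",
        "DEEP_ARCHIVE_ACCESS",
    }

print(should_skip_s3_cold_storage({"Key": "a.csv"}))                                     # False
print(should_skip_s3_cold_storage({"Key": "b.csv", "StorageClass": "DEEP_ARCHIVE"}))     # True
print(should_skip_s3_cold_storage({"Key": "c.csv", "ArchiveStatus": "ARCHIVE_ACCESS"}))  # True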

ingestion/src/metadata/ingestion/source/database/datalake/metadata.py

Lines changed: 19 additions & 6 deletions
@@ -67,6 +67,7 @@
     DataFrameColumnParser,
     fetch_dataframe_first_chunk,
     get_file_format_type,
+    get_iceberg_table_name_from_metadata_path,
 )
 from metadata.utils.filters import filter_by_database, filter_by_schema, filter_by_table
 from metadata.utils.logger import ingestion_logger
@@ -201,7 +202,7 @@ def yield_database_schema(self, schema_name: str) -> Iterable[Either[CreateDatab
 
     def get_tables_name_and_type(  # pylint: disable=too-many-branches
         self,
-    ) -> Iterable[Tuple[str, TableType, SupportedTypes, Optional[int]]]:  # noqa: UP006, UP045
+    ) -> Iterable[Tuple[str, TableType, SupportedTypes, Optional[int], str]]:  # noqa: UP006, UP045
         """
         Handle table and views.
 
@@ -238,26 +239,37 @@ def get_tables_name_and_type(  # pylint: disable=too-many-branches
                     logger.debug(f"Object filtered due to unsupported file type: {key_name}")
                     continue
 
-                yield table_name, TableType.Regular, file_extension, file_size
+                table_type = (
+                    TableType.Iceberg
+                    if get_iceberg_table_name_from_metadata_path(key_name) is not None
+                    else TableType.Regular
+                )
+                yield table_name, table_type, file_extension, file_size, key_name
 
     def yield_table(
         self,
-        table_name_and_type: Tuple[str, TableType, SupportedTypes, Optional[int]],  # noqa: UP006, UP045
+        table_name_and_type: Tuple[str, TableType, SupportedTypes, Optional[int], str],  # noqa: UP006, UP045
     ) -> Iterable[Either[CreateTableRequest]]:
         """
         From topology.
         Prepare a table request and pass it to the sink.
         Uses first chunk only for schema inference to avoid loading entire file.
         """
-        table_name, table_type, table_extension, file_size = table_name_and_type
+        (
+            table_name,
+            table_type,
+            table_extension,
+            file_size,
+            fetch_key,
+        ) = table_name_and_type
         schema_name = self.context.get().database_schema
         try:
             table_constraints = None
             data_frame, raw_data = fetch_dataframe_first_chunk(
                 config_source=self.config_source,
                 client=self.client.client,
                 file_fqn=DatalakeTableSchemaWrapper(
-                    key=table_name,
+                    key=fetch_key,
                     bucket_name=schema_name,
                     file_extension=table_extension,
                     file_size=file_size,
@@ -326,7 +338,8 @@ def standardize_table_name(
         schema: str,
         table: str,  # pylint: disable=unused-argument
     ) -> str:
-        return table
+        iceberg_name = get_iceberg_table_name_from_metadata_path(table)
+        return iceberg_name if iceberg_name is not None else table
 
     def filter_dl_table(self, table_name: str):
         """Filters Datalake Tables based on filterPattern"""

ingestion/src/metadata/readers/dataframe/json.py

Lines changed: 14 additions & 1 deletion
@@ -120,7 +120,20 @@ def _read_json_object(
 
         content = content.decode(UTF_8, errors="ignore") if isinstance(content, bytes) else content
         data = json.loads(content)
-        raw_data = content if isinstance(data, dict) and data.get("$schema") else None
+        raw_data = (
+            content
+            if isinstance(data, dict)
+            and (
+                data.get("$schema") is not None  # JSON Schema files
+                or data.get("format-version")
+                is not None  # Apache Iceberg table metadata
+                or (  # Delta Lake / Iceberg schema structure
+                    isinstance(data.get("schema"), dict)
+                    and isinstance(data.get("schema", {}).get("fields"), list)
+                )
+            )
+            else None
+        )
         data = [data] if isinstance(data, dict) else data
 
         def chunk_generator():
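raw_data is now kept not only for JSON Schema documents but also for Apache Iceberg table metadata (identified by format-version) and for payloads carrying a schema object with a fields list, so the column parser can work from the declared schema instead of sampled rows. The same condition, restated as a standalone predicate over hand-written payloads:

# Standalone restatement of the raw_data condition above; sample payloads are hypothetical.
def keeps_raw_data(data) -> bool:
    return isinstance(data, dict) and (
        data.get("$schema") is not None              # JSON Schema files
        or data.get("format-version") is not None    # Apache Iceberg table metadata
        or (                                         # Delta Lake / Iceberg schema structure
            isinstance(data.get("schema"), dict)
            and isinstance(data.get("schema", {}).get("fields"), list)
        )
    )

print(keeps_raw_data({"format-version": 2, "schema": {"fields": []}}))        # True
print(keeps_raw_data({"$schema": "http://json-schema.org/draft-07/schema"}))  # True
print(keeps_raw_data([{"col1": "val1", "col2": 42}]))                         # False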

ingestion/src/metadata/utils/datalake/datalake_utils.py

Lines changed: 20 additions & 0 deletions
@@ -17,6 +17,7 @@
 import ast
 import json
 import random
+import re
 import traceback
 from typing import Any, Dict, List, Optional, Union, cast  # noqa: UP035
 
@@ -149,6 +150,25 @@ def fetch_dataframe_first_chunk(
     return None
 
 
+_ICEBERG_METADATA_PATH_RE = re.compile(r"([^/]+)/metadata/v\d+\.metadata\.json$")
+
+
+def get_iceberg_table_name_from_metadata_path(metadata_path: str) -> Optional[str]:
+    """
+    Extracts the Iceberg table directory name from a metadata file path.
+
+    Examples:
+        "warehouse/orders/metadata/v2.metadata.json" -> "orders"
+        "my_prefix/sales/metadata/v1.metadata.json" -> "sales"
+        "simple/metadata/v3.metadata.json" -> "simple"
+        "data/orders.json" -> None
+
+    Returns None if the path does not match the Iceberg metadata pattern.
+    """
+    match = _ICEBERG_METADATA_PATH_RE.search(metadata_path)
+    return match.group(1) if match else None
+
+
 def get_file_format_type(key_name, metadata_entry=None):
     for supported_types in SupportedTypes:
         if key_name.lower().endswith(supported_types.value.lower()):
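Note that the two patterns introduced in this commit intentionally differ: the client-side regex captures the full table directory (used to suppress data files beneath it), while this helper captures only the leaf directory name that becomes the table's display name. A quick contrast on a hypothetical path:

# Contrast of the two regexes in this commit on a hypothetical metadata path.
import re

CLIENT_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")   # gcs.py / s3.py
UTILS_RE = re.compile(r"([^/]+)/metadata/v\d+\.metadata\.json$")    # datalake_utils.py

path = "warehouse/orders/metadata/v2.metadata.json"
print(CLIENT_RE.match(path).group(1))  # warehouse/orders  (table directory)
print(UTILS_RE.search(path).group(1))  # orders            (display name)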

ingestion/tests/unit/readers/test_json_reader.py

Lines changed: 80 additions & 0 deletions
@@ -261,6 +261,86 @@ def test_empty_json_lines(self):
         total_rows = sum(len(chunk) for chunk in chunks)
         self.assertEqual(total_rows, 2)
 
+    def test_raw_data_set_for_iceberg_metadata(self):
+        iceberg_metadata = json.dumps(
+            {
+                "format-version": 2,
+                "table-uuid": "abc-123",
+                "location": "gs://bucket/warehouse/orders",
+                "schema": {
+                    "type": "struct",
+                    "schema-id": 0,
+                    "fields": [
+                        {"id": 1, "name": "id", "type": "long", "required": True},
+                        {"id": 2, "name": "name", "type": "string", "required": False},
+                    ],
+                },
+            }
+        ).encode("utf-8")
+
+        _, raw_data = JSONDataFrameReader._read_json_object(iceberg_metadata)
+
+        assert raw_data is not None
+
+    def test_iceberg_columns_parsed_correctly(self):
+        from metadata.utils.datalake.datalake_utils import JsonDataFrameColumnParser
+
+        iceberg_metadata = json.dumps(
+            {
+                "format-version": 2,
+                "table-uuid": "abc-123",
+                "location": "gs://bucket/warehouse/orders",
+                "schema": {
+                    "type": "struct",
+                    "schema-id": 0,
+                    "fields": [
+                        {"id": 1, "name": "id", "type": "long", "required": True},
+                        {"id": 2, "name": "name", "type": "string", "required": False},
+                    ],
+                },
+            }
+        ).encode("utf-8")
+
+        _, raw_data = JSONDataFrameReader._read_json_object(iceberg_metadata)
+        assert raw_data is not None
+
+        import pandas as pd
+
+        from metadata.generated.schema.entity.data.table import DataType
+
+        empty_df = pd.DataFrame()
+        parser = JsonDataFrameColumnParser(data_frame=empty_df, raw_data=raw_data)
+        columns = parser.get_columns()
+
+        assert len(columns) == 2
+        column_names = [col.name.root for col in columns]
+        assert "id" in column_names
+        assert "name" in column_names
+
+        id_col = next(col for col in columns if col.name.root == "id")
+        name_col = next(col for col in columns if col.name.root == "name")
+        assert id_col.dataType in {DataType.INT, DataType.BIGINT, DataType.LONG}
+        assert name_col.dataType in {DataType.STRING, DataType.VARCHAR, DataType.TEXT}
+
+    def test_raw_data_none_for_regular_json(self):
+        regular_json = json.dumps([{"col1": "val1", "col2": 42}]).encode("utf-8")
+
+        _, raw_data = JSONDataFrameReader._read_json_object(regular_json)
+
+        assert raw_data is None
+
+    def test_raw_data_set_for_json_schema(self):
+        json_schema = json.dumps(
+            {
+                "$schema": "http://json-schema.org/draft-07/schema",
+                "properties": {"id": {"type": "integer"}},
+            }
+        ).encode("utf-8")
+
+        _, raw_data = JSONDataFrameReader._read_json_object(json_schema)
+
+        assert raw_data is not None
+
 
 if __name__ == "__main__":
     unittest.main()
