Commit 8082b4e

fix(datalake): address Copilot + gitar-bot findings on Iceberg ingestion
- Fix _is_json_lines false positive: minified single-line Iceberg/Delta metadata dicts were classified as JSONL, bypassing the raw_data gate entirely. Now all three detection conditions ($schema, format-version, schema.fields) are checked.
- Move _ICEBERG_METADATA_RE and _update_iceberg_entry to DatalakeBaseClient to eliminate regex/classification duplication between the GCS and S3 clients (DRY).
- Replace the single-pass O(N)-memory approach with two-pass streaming: pass 1 builds only the iceberg_tables dict (O(tables)); pass 2 streams regular files without accumulation (O(1) per object).
- Fix the sys.modules stub in test_iceberg_discovery.py: use setdefault for all three google module entries to avoid overwriting real installed packages.
1 parent 3604a30 commit 8082b4e
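
To see why the first fix matters: the opening line of a minified Iceberg metadata file is itself one valid JSON dict, so the old dict-without-$schema test accepted it as JSON Lines. A minimal sketch of the failing input (field values are illustrative, not a full Iceberg metadata document):

import json

# Illustrative first line of a minified Iceberg table-metadata file; real
# files carry many more fields. "format-version" and the embedded "schema"
# with a "fields" list are the markers the new check rejects on.
first_line = (
    '{"format-version": 2, "table-uuid": "00000000-0000-0000-0000-000000000000", '
    '"schema": {"type": "struct", "fields": [{"id": 1, "name": "id"}]}}'
)

obj = json.loads(first_line)
# The old heuristic: any dict without a "$schema" key counted as a JSONL record.
old_is_jsonl = isinstance(obj, dict) and not obj.get("$schema")
print(old_is_jsonl)  # True: the metadata file was misclassified as JSON Lines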

5 files changed

Lines changed: 77 additions & 68 deletions


ingestion/src/metadata/ingestion/source/database/datalake/clients/base.py

Lines changed: 24 additions & 1 deletion
@@ -13,13 +13,36 @@
 Datalake Base Client
 """
 
+import re
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, Optional, Tuple  # noqa: UP035
+from typing import Any, Callable, Dict, Iterable, Optional, Tuple  # noqa: UP035
 
 
 class DatalakeBaseClient(ABC):
     """Base DL client implementation"""
 
+    _ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")
+
+    def _update_iceberg_entry(
+        self,
+        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]],  # noqa: UP006, UP045
+        name: str,
+        size: Optional[int],  # noqa: UP045
+    ) -> bool:
+        """
+        If name matches the Iceberg metadata pattern, update iceberg_tables with
+        the highest-version entry and return True. Otherwise return False.
+        """
+        match = self._ICEBERG_METADATA_RE.match(name)
+        if not match:
+            return False
+        table_dir, version = match.group(1), int(match.group(2))
+        existing = iceberg_tables.get(table_dir)
+        if existing is None or version > existing[0]:
+            iceberg_tables[table_dir] = (version, name, size)
+        return True
+
+
     def __init__(self, client: Any, session: Any = None, **kwargs):
         self._client = client
         self._session = session
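
The promoted helper's highest-version-wins contract can be exercised in isolation. A standalone sketch (the logic is copied from the method above, since DatalakeBaseClient itself is abstract; the paths are made up):

import re
from typing import Dict, Optional, Tuple

# Standalone copy of _update_iceberg_entry, for illustration only.
ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")

def update_iceberg_entry(
    iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]],
    name: str,
    size: Optional[int],
) -> bool:
    match = ICEBERG_METADATA_RE.match(name)
    if not match:
        return False
    table_dir, version = match.group(1), int(match.group(2))
    existing = iceberg_tables.get(table_dir)
    if existing is None or version > existing[0]:
        iceberg_tables[table_dir] = (version, name, size)
    return True

tables: Dict[str, Tuple[int, str, Optional[int]]] = {}
update_iceberg_entry(tables, "db/orders/metadata/v1.metadata.json", 100)
update_iceberg_entry(tables, "db/orders/metadata/v3.metadata.json", 120)
update_iceberg_entry(tables, "db/orders/metadata/v2.metadata.json", 110)  # stale, ignored
print(tables)  # {'db/orders': (3, 'db/orders/metadata/v3.metadata.json', 120)}
assert not update_iceberg_entry(tables, "db/orders/data/part-0.parquet", 5)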

ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py

Lines changed: 13 additions & 28 deletions
@@ -14,7 +14,6 @@
 """
 
 import os
-import re
 from copy import deepcopy
 from functools import partial
 from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple  # noqa: UP035
@@ -108,45 +107,26 @@ def get_database_schema_names(self, bucket_name: Optional[str]) -> Iterable[str]
         for bucket in self._client.list_buckets():
             yield bucket.name
 
-    _ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")
-
     @staticmethod
     def _should_skip_gcs_cold_storage(blob) -> bool:
         storage_class = getattr(blob, "storage_class", None)
         return bool(storage_class and storage_class in GCS_COLD_STORAGE_CLASSES)
 
-    def _classify_gcs_blob(
-        self,
-        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]],  # noqa: UP006, UP045
-        regular_files: List[Tuple[str, Optional[int]]],  # noqa: UP006, UP045
-        blob,
-    ) -> None:
-        match = self._ICEBERG_METADATA_RE.match(blob.name)
-        if match:
-            table_dir, version = match.group(1), int(match.group(2))
-            existing = iceberg_tables.get(table_dir)
-            if existing is None or version > existing[0]:
-                iceberg_tables[table_dir] = (version, blob.name, blob.size)
-        else:
-            regular_files.append((blob.name, blob.size))
-
     def get_table_names(
         self,
         bucket_name: str,
         prefix: Optional[str],  # noqa: UP045
         skip_cold_storage: bool = False,
     ) -> Iterable[Tuple[str, Optional[int]]]:  # noqa: UP006, UP045
         """
-        Lists tables in a GCS bucket using a single pass.
+        Lists tables in a GCS bucket using a two-pass approach.
 
-        Iceberg table directories are identified by blobs matching
-        ``<table_dir>/metadata/v<N>.metadata.json``. Only the blob with the
-        highest integer version is yielded per table directory. Regular files
-        not under any Iceberg table directory are also yielded.
+        Pass 1 collects only the Iceberg table dict (memory proportional to the
+        number of Iceberg tables, which is always small). Pass 2 streams regular
+        files without accumulation, keeping memory overhead at O(1) per object.
         """
         bucket = self._client.get_bucket(bucket_name)
         iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}  # noqa: UP006
-        regular_files: List[Tuple[str, Optional[int]]] = []  # noqa: UP006
 
         for blob in bucket.list_blobs(prefix=prefix):
             if skip_cold_storage and self._should_skip_gcs_cold_storage(blob):
@@ -155,14 +135,19 @@ def get_table_names(
                     f"(storage_class: {getattr(blob, 'storage_class', None)})"
                 )
                 continue
-            self._classify_gcs_blob(iceberg_tables, regular_files, blob)
+            self._update_iceberg_entry(iceberg_tables, blob.name, blob.size)
 
         iceberg_dirs = set(iceberg_tables.keys())
         for _, metadata_blob_path, size in iceberg_tables.values():
             yield metadata_blob_path, size
-        for file_path, size in regular_files:
-            if not any(file_path.startswith(d + "/") for d in iceberg_dirs):
-                yield file_path, size
+
+        for blob in bucket.list_blobs(prefix=prefix):
+            if skip_cold_storage and self._should_skip_gcs_cold_storage(blob):
+                continue
+            if not self._ICEBERG_METADATA_RE.match(blob.name) and not any(
+                blob.name.startswith(d + "/") for d in iceberg_dirs
+            ):
+                yield blob.name, blob.size
 
     def close(self, service_connection):
         os.environ.pop("GOOGLE_CLOUD_PROJECT", "")
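
The combined effect of the two passes is easiest to see over plain names. A sketch with a hypothetical blob listing and no GCS client, reusing the same regex:

import re
from typing import Dict, Optional, Tuple

ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")

# Hypothetical listing: one Iceberg table plus a loose CSV.
blobs = [
    ("db/orders/metadata/v1.metadata.json", 100),
    ("db/orders/metadata/v2.metadata.json", 120),
    ("db/orders/data/part-0.parquet", 4096),
    ("exports/customers.csv", 512),
]

# Pass 1: remember only the highest metadata version per table directory.
iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}
for name, size in blobs:
    m = ICEBERG_METADATA_RE.match(name)
    if m:
        table_dir, version = m.group(1), int(m.group(2))
        existing = iceberg_tables.get(table_dir)
        if existing is None or version > existing[0]:
            iceberg_tables[table_dir] = (version, name, size)

iceberg_dirs = set(iceberg_tables)
results = [(path, size) for _, path, size in iceberg_tables.values()]

# Pass 2: stream everything that is neither metadata nor under a table directory.
for name, size in blobs:
    if not ICEBERG_METADATA_RE.match(name) and not any(
        name.startswith(d + "/") for d in iceberg_dirs
    ):
        results.append((name, size))

print(results)
# [('db/orders/metadata/v2.metadata.json', 120), ('exports/customers.csv', 512)]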

ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py

Lines changed: 16 additions & 30 deletions
@@ -13,9 +13,8 @@
 Datalake S3 Client
 """
 
-import re
 from functools import partial
-from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple  # noqa: UP035
+from typing import Callable, Dict, Iterable, Optional, Set, Tuple  # noqa: UP035
 
 from metadata.clients.aws_client import AWSClient
 from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import (
@@ -62,8 +61,6 @@ def get_database_schema_names(self, bucket_name: Optional[str]) -> Iterable[str]
         for bucket in self._client.list_buckets()["Buckets"]:
             yield bucket["Name"]
 
-    _ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")
-
     @staticmethod
     def _should_skip_s3_cold_storage(key: dict) -> bool:
         storage_class = key.get("StorageClass", "STANDARD")
@@ -73,42 +70,24 @@ def _should_skip_s3_cold_storage(key: dict) -> bool:
             "DEEP_ARCHIVE_ACCESS",
         }
 
-    def _classify_s3_object(
-        self,
-        iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]],  # noqa: UP006, UP045
-        regular_files: List[Tuple[str, Optional[int]]],  # noqa: UP006, UP045
-        key_name: str,
-        size: Optional[int],  # noqa: UP045
-    ) -> None:
-        match = self._ICEBERG_METADATA_RE.match(key_name)
-        if match:
-            table_dir, version = match.group(1), int(match.group(2))
-            existing = iceberg_tables.get(table_dir)
-            if existing is None or version > existing[0]:
-                iceberg_tables[table_dir] = (version, key_name, size)
-        else:
-            regular_files.append((key_name, size))
-
     def get_table_names(
         self,
         bucket_name: str,
         prefix: Optional[str],  # noqa: UP045
         skip_cold_storage: bool = False,
     ) -> Iterable[Tuple[str, Optional[int]]]:  # noqa: UP006, UP045
         """
-        Lists tables in an S3 bucket using a single pass.
+        Lists tables in an S3 bucket using a two-pass approach.
 
-        Iceberg table directories are identified by objects matching
-        ``<table_dir>/metadata/v<N>.metadata.json``. Only the object with the
-        highest integer version is yielded per table directory. Regular files
-        not under any Iceberg table directory are also yielded.
+        Pass 1 collects only the Iceberg table dict (memory proportional to the
+        number of Iceberg tables, which is always small). Pass 2 streams regular
+        files without accumulation, keeping memory overhead at O(1) per object.
        """
         kwargs: Dict[str, str] = {"Bucket": bucket_name}  # noqa: UP006
         if prefix:
             kwargs["Prefix"] = prefix if prefix.endswith("/") else f"{prefix}/"
 
         iceberg_tables: Dict[str, Tuple[int, str, Optional[int]]] = {}  # noqa: UP006
-        regular_files: List[Tuple[str, Optional[int]]] = []  # noqa: UP006
 
         for key in list_s3_objects(self._client, **kwargs):
             key_name = key["Key"]
@@ -120,14 +99,21 @@ def get_table_names(
                     f"ArchiveStatus: {key.get('ArchiveStatus', '')})"
                 )
                 continue
-            self._classify_s3_object(iceberg_tables, regular_files, key_name, size)
+            self._update_iceberg_entry(iceberg_tables, key_name, size)
 
         iceberg_dirs = set(iceberg_tables.keys())
         for _, metadata_key, size in iceberg_tables.values():
             yield metadata_key, size
-        for file_path, size in regular_files:
-            if not any(file_path.startswith(d + "/") for d in iceberg_dirs):
-                yield file_path, size
+
+        for key in list_s3_objects(self._client, **kwargs):
+            key_name = key["Key"]
+            size = key.get("Size")
+            if skip_cold_storage and self._should_skip_s3_cold_storage(key):
+                continue
+            if not self._ICEBERG_METADATA_RE.match(key_name) and not any(
+                key_name.startswith(d + "/") for d in iceberg_dirs
+            ):
+                yield key_name, size
 
     def get_folders_prefix(self, bucket_name: str, prefix: Optional[str]) -> Iterable[str]:  # noqa: UP045
         for page in self._client.get_paginator("list_objects_v2").paginate(
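
One trade-off implicit in this design: list_s3_objects is consumed twice, so the bucket is paginated through ListObjectsV2 twice, buying O(1) memory at the cost of a second round of API calls. A sketch of the pass-2 filter over raw key dicts (hypothetical keys, with a single GLACIER check standing in for the client's full cold-storage set):

import re

ICEBERG_METADATA_RE = re.compile(r"^(.*)/metadata/v(\d+)\.metadata\.json$")

# Hypothetical ListObjectsV2 contents, shaped like the per-key dicts boto3 returns.
keys = [
    {"Key": "db/orders/metadata/v2.metadata.json", "Size": 120, "StorageClass": "STANDARD"},
    {"Key": "db/orders/data/part-0.parquet", "Size": 4096, "StorageClass": "STANDARD"},
    {"Key": "archive/old.csv", "Size": 512, "StorageClass": "GLACIER"},
    {"Key": "exports/customers.csv", "Size": 256, "StorageClass": "STANDARD"},
]
iceberg_dirs = {"db/orders"}  # produced by pass 1

for key in keys:
    key_name, size = key["Key"], key.get("Size")
    # GLACIER stands in for the real cold-storage check here.
    if key.get("StorageClass", "STANDARD") == "GLACIER":
        continue
    if not ICEBERG_METADATA_RE.match(key_name) and not any(
        key_name.startswith(d + "/") for d in iceberg_dirs
    ):
        print(key_name, size)  # only exports/customers.csv 256 is emitted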

ingestion/src/metadata/readers/dataframe/json.py

Lines changed: 11 additions & 1 deletion
@@ -153,7 +153,17 @@ def _is_json_lines(file_obj) -> bool:
             return True
     try:
         obj = json.loads(first_line)
-        return isinstance(obj, dict) and not obj.get("$schema")
+        if not isinstance(obj, dict):
+            return False
+        if obj.get("$schema") is not None:
+            return False
+        if obj.get("format-version") is not None:
+            return False
+        if isinstance(obj.get("schema"), dict) and isinstance(
+            obj.get("schema", {}).get("fields"), list
+        ):
+            return False
+        return True
     except json.JSONDecodeError:
         return False
 
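
Extracted for illustration, the patched condition chain behaves as follows on three representative first lines (this is a standalone copy working on a string, not the module's actual _is_json_lines, which takes a file object):

import json

def is_json_lines_first_line(first_line: str) -> bool:
    """Patched condition chain, standalone for illustration."""
    try:
        obj = json.loads(first_line)
    except json.JSONDecodeError:
        return False
    if not isinstance(obj, dict):
        return False
    if obj.get("$schema") is not None:
        return False  # JSON Schema document
    if obj.get("format-version") is not None:
        return False  # Iceberg/Delta table metadata
    if isinstance(obj.get("schema"), dict) and isinstance(
        obj.get("schema", {}).get("fields"), list
    ):
        return False  # embedded table schema, not a data record
    return True

print(is_json_lines_first_line('{"id": 1, "name": "ada"}'))  # True
print(is_json_lines_first_line('{"format-version": 2}'))  # False
print(is_json_lines_first_line('{"$schema": "http://json-schema.org/schema#"}'))  # False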

ingestion/tests/unit/source/database/test_iceberg_discovery.py

Lines changed: 13 additions & 8 deletions
@@ -17,14 +17,19 @@
 from unittest.mock import MagicMock, patch
 
 # Stub google.cloud.storage so this test file runs without the google-cloud-storage
-# package being installed. The logic under test (_get_iceberg_tables, get_table_names)
-# only interacts with the storage client through our own mock objects.
-_gcloud_mod = types.ModuleType("google.cloud")
-_storage_mod = types.ModuleType("google.cloud.storage")
-_storage_mod.Client = MagicMock
-sys.modules.setdefault("google", types.ModuleType("google"))
-sys.modules["google.cloud"] = _gcloud_mod
-sys.modules["google.cloud.storage"] = _storage_mod
+# package being installed. setdefault preserves the real module if it is already
+# present, which prevents breaking other tests or masking integration issues.
+_google_mod = sys.modules.setdefault("google", types.ModuleType("google"))
+_gcloud_mod = sys.modules.setdefault("google.cloud", types.ModuleType("google.cloud"))
+_storage_mod = sys.modules.setdefault(
+    "google.cloud.storage", types.ModuleType("google.cloud.storage")
+)
+if not hasattr(_storage_mod, "Client"):
+    _storage_mod.Client = MagicMock
+if not hasattr(_google_mod, "cloud"):
+    _google_mod.cloud = _gcloud_mod
+if not hasattr(_gcloud_mod, "storage"):
+    _gcloud_mod.storage = _storage_mod
 
 from metadata.ingestion.source.database.datalake.clients.gcs import (  # noqa: E402
     DatalakeGcsClient,
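
The safety of this pattern is plain dict semantics on sys.modules: setdefault returns whatever is already registered and only installs the stub when the slot is empty, and the hasattr guards do the same for attributes. A minimal demonstration with throwaway module names:

import sys
import types

real = types.ModuleType("demo.pkg")
sys.modules["demo.pkg"] = real

# setdefault returns the module that is already registered, not the new stub...
returned = sys.modules.setdefault("demo.pkg", types.ModuleType("demo.pkg"))
assert returned is real

# ...and only installs the stub when nothing is registered yet.
stub = sys.modules.setdefault("demo.other", types.ModuleType("demo.other"))
assert sys.modules["demo.other"] is stub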
