Skip to content

Commit be355e5

Browse files
committed
Fix: datalake parse array type nested structure fields inside json file (#27798)
1 parent d121367 commit be355e5

4 files changed

Lines changed: 164 additions & 9 deletions

File tree

ingestion/src/metadata/ingestion/source/database/redshift/utils.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"""
1414
import re
1515
from collections import defaultdict
16+
from typing import Any
1617

1718
import sqlalchemy as sa
1819
from packaging.version import Version
@@ -49,6 +50,59 @@
4950
logger = ingestion_logger()
5051

5152

53+
def _redshift_initialize(self, connection):
    """
    Replacement for PGDialect/PGDialect_psycopg2 ``initialize`` that skips
    the PostgreSQL-only startup probes Redshift cannot answer
    (e.g., SHOW standard_conforming_strings).
    """
    from sqlalchemy.engine.default import DefaultDialect  # noqa: PLC0415

    DefaultDialect.initialize(self, connection)
    # Hard-code the capabilities PG would normally probe the server for;
    # Redshift supports none of them.
    for capability in (
        "_backslash_escapes",
        "supports_smallserial",
        "_supports_drop_index_concurrently",
        "supports_identity_columns",
        "_has_native_hstore",
    ):
        setattr(self, capability, False)
67+
68+
69+
def _load_domains(self, connection, schema: str | None = None, **kw: Any) -> dict:
70+
"""
71+
Override to return empty dict since Redshift does not support user-created
72+
domains and pg_catalog.pg_collation does not exist in Redshift, causing a
73+
ProgrammingError that aborts the transaction and breaks all subsequent queries.
74+
"""
75+
return {}
76+
77+
78+
def get_temp_table_names(self, connection, schema=None, **kw):
    """
    Redshift's ``pg_catalog.pg_class`` lacks the ``relpersistence`` column the
    inherited PGDialect query relies on; running it would raise a
    ProgrammingError that aborts the transaction and breaks all subsequent
    queries. Report no temporary tables instead.
    """
    return []
86+
87+
88+
def get_multi_columns(
89+
self,
90+
connection,
91+
schema: str | None = None,
92+
filter_names: Any | None = None,
93+
scope: Any | None = None,
94+
kind: Any | None = None,
95+
**kw: Any,
96+
):
97+
"""
98+
Override PGDialect's get_multi_columns to avoid querying
99+
pg_attribute.attcollation which does not exist in Redshift.
100+
Falls back to the default implementation that delegates to
101+
the already-overridden get_columns() method.
102+
"""
103+
return self._default_multi_reflect(self.get_columns, connection, **kw)
104+
105+
52106
# pylint: disable=protected-access
53107
@calculate_execution_time()
54108
@reflection.cache

ingestion/src/metadata/profiler/orm/converter/redshift/converter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
to an SQLAlchemy ORM class.
1515
"""
1616

17-
from typing import Dict, Set
17+
from typing import Dict, Set, cast # noqa: UP035
1818

1919
from sqlalchemy.sql.sqltypes import TypeEngine
2020

@@ -48,5 +48,5 @@ def map_sqa_to_om_types() -> Dict[TypeEngine, Set[DataType]]:
4848

4949
return {
5050
**CommonMapTypes.map_sqa_to_om_types(),
51-
GEOMETRY: {DataType.GEOMETRY},
51+
cast("TypeEngine", GEOMETRY): {DataType.GEOMETRY},
5252
}

ingestion/src/metadata/utils/datalake/datalake_utils.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,16 @@
3232
logger = utils_logger()
3333

3434

35+
class _ArrayOfStruct:
36+
"""Marker for a JSON value observed as a list of dicts. Carries the merged struct shape
37+
so downstream column construction can render it as ARRAY<STRUCT<...>>."""
38+
39+
__slots__ = ("struct",)
40+
41+
def __init__(self, struct: Dict): # noqa: UP006
42+
self.struct = struct
43+
44+
3545
def fetch_dataframe_generator(
3646
config_source,
3747
client,
@@ -288,6 +298,10 @@ def _get_columns(cls, data_frame: "DataFrame"):
288298
}
289299
if data_type == DataType.ARRAY:
290300
parsed_string["arrayDataType"] = DataType.UNKNOWN
301+
struct_children = cls._get_array_struct_children(data_frame[column].dropna()[:100])
302+
if struct_children:
303+
parsed_string["arrayDataType"] = DataType.STRUCT
304+
parsed_string["children"] = struct_children
291305

292306
if data_type == DataType.JSON:
293307
parsed_string["children"] = cls.get_children(
@@ -400,6 +414,11 @@ def unique_json_structure(cls, dicts: List[Dict]) -> Dict:
400414
result[key] = cls.unique_json_structure(
401415
[nested_json if isinstance(nested_json, dict) else {}, value]
402416
)
417+
elif isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
418+
merged_struct = cls.unique_json_structure(value)
419+
existing = result.get(key)
420+
existing_struct = existing.struct if isinstance(existing, _ArrayOfStruct) else {}
421+
result[key] = _ArrayOfStruct(cls.unique_json_structure([existing_struct, merged_struct]))
403422
else:
404423
result[key] = value
405424
return result
@@ -414,15 +433,19 @@ def construct_json_column_children(cls, json_column: Dict) -> List[Dict]:
414433
children = []
415434
for key, value in json_column.items():
416435
column = {}
417-
type_ = type(value).__name__.lower()
418-
column["dataTypeDisplay"] = cls._data_formats.get(
419-
type_, DataType.UNKNOWN
420-
).value
421-
column["dataType"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
422436
column["name"] = truncate_column_name(key)
423437
column["displayName"] = key
424-
if isinstance(value, dict):
425-
column["children"] = cls.construct_json_column_children(value)
438+
if isinstance(value, _ArrayOfStruct):
439+
column["dataType"] = DataType.ARRAY.value
440+
column["dataTypeDisplay"] = DataType.ARRAY.value
441+
column["arrayDataType"] = DataType.STRUCT
442+
column["children"] = cls.construct_json_column_children(value.struct)
443+
else:
444+
type_ = type(value).__name__.lower()
445+
column["dataTypeDisplay"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
446+
column["dataType"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
447+
if isinstance(value, dict):
448+
column["children"] = cls.construct_json_column_children(value)
426449
children.append(column)
427450

428451
return children
@@ -451,6 +474,27 @@ def get_children(cls, json_column) -> List[Dict]:
451474

452475
return cls.construct_json_column_children(json_structure)
453476

477+
@classmethod
def _get_array_struct_children(cls, array_column: Any) -> List[Dict]:  # noqa: UP006
    """Infer the merged struct shape of an ARRAY column whose elements are dicts.

    JSON strings are decoded first; dict elements (and dicts nested in list
    elements) are pooled, merged into a single struct, and returned as
    children columns. Returns an empty list when no dict elements are found.
    """
    dict_elements = []
    for raw in array_column.values.tolist():
        element = raw
        if isinstance(element, str):
            try:
                element = json.loads(element)
            except (TypeError, ValueError):
                # Not valid JSON — skip this sample entirely
                continue
        if isinstance(element, dict):
            dict_elements.append(element)
        elif isinstance(element, list):
            dict_elements.extend(entry for entry in element if isinstance(entry, dict))
    if not dict_elements:
        return []
    return cls.construct_json_column_children(cls.unique_json_structure(dict_elements))
497+
454498

455499
# pylint: disable=import-outside-toplevel
456500
class ParquetDataFrameColumnParser:

ingestion/tests/unit/utils/test_datalake.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,63 @@ def test_construct_column(self):
167167
for el in zip(expected, actual):
168168
self.assertDictEqual(el[0], el[1])
169169

170+
def test_unique_json_structure_with_list_of_dicts(self):
    """list-of-dicts values are merged into a struct shape (e.g. Iceberg `schema.fields`)."""
    sample_data = [
        {
            "schema": {
                "fields": [
                    {"id": 1, "name": "customer_id", "type": "string"},
                    {"id": 2, "name": "customer_type_cd", "type": "string"},
                ]
            }
        }
    ]

    actual = GenericDataFrameColumnParser.unique_json_structure(sample_data)
    fields_value = actual["schema"]["fields"]

    # Local import: _ArrayOfStruct is a private helper of the parser module
    from metadata.utils.datalake.datalake_utils import _ArrayOfStruct

    # The list of dicts is wrapped in the marker; its struct carries the
    # union of the element keys (here both dicts share the same three keys)
    assert isinstance(fields_value, _ArrayOfStruct)
    assert set(fields_value.struct.keys()) == {"id", "name", "type"}
190+
191+
def test_unique_json_structure_merges_list_of_dicts_across_samples(self):
    """list-of-dicts values across multiple samples are unioned, not overwritten."""
    # Local import: _ArrayOfStruct is a private helper of the parser module
    from metadata.utils.datalake.datalake_utils import _ArrayOfStruct

    # Each sample contributes a different subset of struct keys
    sample_data = [
        {"schema": {"fields": [{"id": 1, "name": "customer_id", "type": "string"}]}},
        {"schema": {"fields": [{"id": 2, "required": False, "type": "string"}]}},
        {"schema": {"fields": [{"description": "ciam id"}]}},
    ]

    actual = GenericDataFrameColumnParser.unique_json_structure(sample_data)
    fields_value = actual["schema"]["fields"]

    # The marker's struct must hold the union of keys from all three samples
    assert isinstance(fields_value, _ArrayOfStruct)
    assert set(fields_value.struct.keys()) == {"id", "name", "type", "required", "description"}
206+
207+
def test_construct_column_with_array_of_struct(self):
    """list-of-dicts values render as ARRAY<STRUCT<...>> with children for the struct fields."""
    structure = {
        "schema": {
            "fields": [
                {"id": 1, "name": "customer_id", "type": "string"},
                {"id": 2, "name": "ciam_id", "type": "string"},
            ]
        }
    }
    # Run the full pipeline: merge the structure, then build column children
    merged = GenericDataFrameColumnParser.unique_json_structure([structure])
    children = GenericDataFrameColumnParser.construct_json_column_children(merged)

    schema_col = children[0]
    fields_col = next(c for c in schema_col["children"] if c["name"] == "fields")

    # The "fields" column must be rendered as ARRAY of STRUCT, with one
    # child column per struct key
    assert fields_col["dataType"] == DataType.ARRAY.value
    assert fields_col["arrayDataType"] == DataType.STRUCT
    assert {child["name"] for child in fields_col["children"]} == {"id", "name", "type"}
226+
170227
def test_create_column_object(self):
171228
"""test create column object fn"""
172229
formatted_column = GenericDataFrameColumnParser.construct_json_column_children(

0 commit comments

Comments
 (0)