open-metadata
diff --git a/‎.github/workflows/auto-cherry-pick-labeled-prs.yaml‎
Lines changed: 25 additions & 13 deletions b/‎.github/workflows/auto-cherry-pick-labeled-prs.yaml‎
Lines changed: 25 additions & 13 deletions
diff --git a/‎bootstrap/sql/migrations/native/1.13.0/mysql/postDataMigrationSQLScript.sql‎
Lines changed: 3 additions & 0 deletions b/‎bootstrap/sql/migrations/native/1.13.0/mysql/postDataMigrationSQLScript.sql‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql‎
Lines changed: 3 additions & 0 deletions b/‎bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎ingestion/src/metadata/ingestion/source/database/redshift/utils.py‎
Lines changed: 11 additions & 2 deletions b/‎ingestion/src/metadata/ingestion/source/database/redshift/utils.py‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎ingestion/src/metadata/profiler/orm/converter/redshift/converter.py‎
Lines changed: 2 additions & 2 deletions b/‎ingestion/src/metadata/profiler/orm/converter/redshift/converter.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ingestion/src/metadata/utils/datalake/datalake_utils.py‎
Lines changed: 51 additions & 5 deletions b/‎ingestion/src/metadata/utils/datalake/datalake_utils.py‎
Lines changed: 51 additions & 5 deletions
diff --git a/‎ingestion/tests/unit/utils/test_datalake.py‎
Lines changed: 57 additions & 0 deletions b/‎ingestion/tests/unit/utils/test_datalake.py‎
Lines changed: 57 additions & 0 deletions
@@ -16,49 +16,61 @@ permissions:
 env:
   CURRENT_RELEASE_ENDPOINT: ${{ vars.CURRENT_RELEASE_ENDPOINT }}  # Endpoint that returns the current release version in json format
 jobs:
-  cherry_pick_to_release_branch:
+  # Resolves the release branches to cherry-pick into and publishes them as a job output.
+  get_release_branch:
+    # Runs only for merged pull requests that carry the 'To release' label.
     if: github.event.pull_request.merged == true &&
         contains(github.event.pull_request.labels.*.name, 'To release')
+    runs-on: ubuntu-latest
+    # release_branches is the string the step below writes to $GITHUB_OUTPUT
+    # (a compact JSON array produced by `jq -c`).
+    outputs:
+      release_branches: ${{ steps.get_release_version.outputs.release_branches }}
+    steps:
+      # Queries CURRENT_RELEASE_ENDPOINT and keeps only its `collate_branches` field;
+      # the jq alternative operator `// []` substitutes an empty array when that field is null or absent.
+      - name: Get the release version
+        id: get_release_version
+        run: |
+          CURRENT_RELEASE=$(curl -s $CURRENT_RELEASE_ENDPOINT | jq -c '.collate_branches // []')
+          echo "release_branches=${CURRENT_RELEASE}" >> $GITHUB_OUTPUT
+
+  cherry_pick_to_release_branch:
+    needs: get_release_branch
+    if: needs.get_release_branch.outputs.release_branches != '' && needs.get_release_branch.outputs.release_branches != '[]'
     runs-on: ubuntu-latest  # Running it on ubuntu-latest on purpose (we're not using all the free minutes)
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: ${{ fromJson(needs.get_release_branch.outputs.release_branches) }}
     steps:
       - name: Checkout main branch
         uses: actions/checkout@v4
         with:
           ref: main
           fetch-depth: 0
-      - name: Get the release version
-        id: get_release_version
-        run: |
-          CURRENT_RELEASE=$(curl -s $CURRENT_RELEASE_ENDPOINT | jq -r .om_branch)
-          echo "CURRENT_RELEASE=${CURRENT_RELEASE}" >> $GITHUB_ENV
       - name: Cherry-pick changes from PR
         id: cherry_pick
         continue-on-error: true
         run: |
           git config --global user.email "release-bot@open-metadata.org"
           git config --global user.name "OpenMetadata Release Bot"
-          git fetch origin ${CURRENT_RELEASE}
-          git checkout ${CURRENT_RELEASE}
+          git fetch origin ${{ matrix.branch }}
+          git checkout ${{ matrix.branch }}
           git cherry-pick -x ${{ github.event.pull_request.merge_commit_sha }}
       - name: Push changes to release branch
         id: push_changes
         continue-on-error: true
         if: steps.cherry_pick.outcome == 'success'
         run: |
-          git push origin ${CURRENT_RELEASE}
+          git push origin ${{ matrix.branch }}
       - name: Post a comment on failure
         if: steps.cherry_pick.outcome != 'success' || steps.push_changes.outcome != 'success'
         uses: actions/github-script@v7
         with:
           script: |
             const prNumber = context.payload.pull_request.number;
-            const releaseVersion = process.env.CURRENT_RELEASE;
+            const releaseBranch = '${{ matrix.branch }}';
             const workflowRunUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
             github.rest.issues.createComment({
               owner: context.repo.owner,
               repo: context.repo.repo,
               issue_number: prNumber,
-              body: `Failed to cherry-pick changes to the ${releaseVersion} branch.
+              body: `Failed to cherry-pick changes to the ${releaseBranch} branch.
                 Please cherry-pick the changes manually.
                 You can find more details [here](${workflowRunUrl}).`
             })
@@ -68,10 +80,10 @@ jobs:
         with:
           script: |
             const prNumber = context.payload.pull_request.number;
-            const releaseVersion = process.env.CURRENT_RELEASE;
+            const releaseBranch = '${{ matrix.branch }}';
             github.rest.issues.createComment({
               owner: context.repo.owner,
               repo: context.repo.repo,
               issue_number: prNumber,
-              body: `Changes have been cherry-picked to the ${releaseVersion} branch.`
+              body: `Changes have been cherry-picked to the ${releaseBranch} branch.`
             })
@@ -80,6 +80,9 @@ UPDATE glossary_term_entity
 SET json = JSON_REMOVE(json, '$.relatedTerms')
 WHERE JSON_EXTRACT(json, '$.relatedTerms') IS NOT NULL;
 
+-- entity_extension version snapshots: handled by Java migration
+-- migrateGlossaryTermVersionRelatedTermsToTermRelation (transforms in place to preserve history).
+
 -- Backfill conceptMappings for existing glossary terms
 UPDATE glossary_term_entity
 SET json = JSON_SET(COALESCE(json, '{}'), '$.conceptMappings', JSON_ARRAY())
 
@@ -82,6 +82,9 @@ UPDATE glossary_term_entity
 SET json = (json::jsonb - 'relatedTerms')::json
 WHERE jsonb_exists(json::jsonb, 'relatedTerms');
 
+-- entity_extension version snapshots: handled by Java migration
+-- migrateGlossaryTermVersionRelatedTermsToTermRelation (transforms in place to preserve history).
+
 -- Backfill conceptMappings for existing glossary terms
 UPDATE glossary_term_entity
 SET json = jsonb_set(COALESCE(json::jsonb, '{}'::jsonb), '{conceptMappings}', '[]'::jsonb)
 
@@ -14,6 +14,7 @@
 
 import re
 from collections import defaultdict
+from typing import Any
 
 import sqlalchemy as sa
 from packaging.version import Version
@@ -66,7 +67,7 @@ def _redshift_initialize(self, connection):
     self._has_native_hstore = False
 
 
-def _load_domains(self, connection, **kw):
+def _load_domains(self, connection, schema: str | None = None, **kw: Any) -> dict:
     """
     Override to return empty dict since Redshift does not support user-created
     domains and pg_catalog.pg_collation does not exist in Redshift, causing a
@@ -85,7 +86,15 @@ def get_temp_table_names(self, connection, schema=None, **kw):
     return []
 
 
-def get_multi_columns(self, connection, **kw):
+def get_multi_columns(
+    self,
+    connection,
+    schema: str | None = None,
+    filter_names: Any | None = None,
+    scope: Any | None = None,
+    kind: Any | None = None,
+    **kw: Any,
+):
     """
     Override PGDialect's get_multi_columns to avoid querying
     pg_attribute.attcollation which does not exist in Redshift.
 
@@ -14,7 +14,7 @@
 to an SQLAlchemy ORM class.
 """
 
-from typing import Dict, Set  # noqa: UP035
+from typing import Dict, Set, cast  # noqa: UP035
 
 from sqlalchemy.sql.sqltypes import TypeEngine
 
@@ -45,5 +45,5 @@ def map_sqa_to_om_types() -> Dict[TypeEngine, Set[DataType]]:  # noqa: UP006
 
         return {
             **CommonMapTypes.map_sqa_to_om_types(),
-            GEOMETRY: {DataType.GEOMETRY},
+            cast("TypeEngine", GEOMETRY): {DataType.GEOMETRY},
         }
@@ -34,6 +34,16 @@
 logger = utils_logger()
 
 
+class _ArrayOfStruct:
+    """Marker for a JSON value observed as a list of dicts.
+
+    Carries the merged struct shape so downstream column construction can render it
+    as ARRAY<STRUCT<...>>.
+
+    Attributes:
+        struct: dict holding the merged shape of the dicts found inside the list.
+    """
+
+    # A single attribute is all this marker stores; __slots__ avoids a per-instance __dict__.
+    __slots__ = ("struct",)
+
+    def __init__(self, struct: Dict):  # noqa: UP006
+        # Stored as given (no copy is made).
+        self.struct = struct
+
+
 def fetch_dataframe_generator(
     config_source,
     client,
@@ -317,6 +327,10 @@ def _get_columns(cls, data_frame: "DataFrame"):  # noqa: F821
                     }
                     if data_type == DataType.ARRAY:
                         parsed_string["arrayDataType"] = DataType.UNKNOWN
+                        struct_children = cls._get_array_struct_children(data_frame[column].dropna()[:100])
+                        if struct_children:
+                            parsed_string["arrayDataType"] = DataType.STRUCT
+                            parsed_string["children"] = struct_children
 
                     if data_type == DataType.JSON:
                         parsed_string["children"] = cls.get_children(data_frame[column].dropna()[:100])
@@ -418,6 +432,11 @@ def unique_json_structure(cls, dicts: List[Dict]) -> Dict:  # noqa: UP006
                     result[key] = cls.unique_json_structure(
                         [nested_json if isinstance(nested_json, dict) else {}, value]
                     )
+                elif isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
+                    merged_struct = cls.unique_json_structure(value)
+                    existing = result.get(key)
+                    existing_struct = existing.struct if isinstance(existing, _ArrayOfStruct) else {}
+                    result[key] = _ArrayOfStruct(cls.unique_json_structure([existing_struct, merged_struct]))
                 else:
                     result[key] = value
         return result
@@ -432,13 +451,19 @@ def construct_json_column_children(cls, json_column: Dict) -> List[Dict]:  # noq
         children = []
         for key, value in json_column.items():
             column = {}
-            type_ = type(value).__name__.lower()
-            column["dataTypeDisplay"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
-            column["dataType"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
             column["name"] = truncate_column_name(key)
             column["displayName"] = key
-            if isinstance(value, dict):
-                column["children"] = cls.construct_json_column_children(value)
+            if isinstance(value, _ArrayOfStruct):
+                column["dataType"] = DataType.ARRAY.value
+                column["dataTypeDisplay"] = DataType.ARRAY.value
+                column["arrayDataType"] = DataType.STRUCT
+                column["children"] = cls.construct_json_column_children(value.struct)
+            else:
+                type_ = type(value).__name__.lower()
+                column["dataTypeDisplay"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
+                column["dataType"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
+                if isinstance(value, dict):
+                    column["children"] = cls.construct_json_column_children(value)
             children.append(column)
 
         return children
@@ -466,6 +491,27 @@ def get_children(cls, json_column) -> List[Dict]:  # noqa: UP006
 
         return cls.construct_json_column_children(json_structure)
 
+    @classmethod
+    def _get_array_struct_children(cls, array_column: Any) -> List[Dict]:  # noqa: UP006
+        """For an ARRAY column whose elements are dicts, infer the merged struct shape and
+        return it as children. Returns an empty list when elements are not dicts.
+
+        Args:
+            array_column: sampled column values; must expose ``.values.tolist()``
+                (presumably a pandas Series - confirm against the caller).
+
+        Returns:
+            The child column dicts built by ``construct_json_column_children`` from the
+            merged struct, or ``[]`` when no dict was found in the sample.
+        """
+        # Collect every dict found in the sample, whether it is the cell value itself
+        # or an item of a list-valued cell.
+        flattened = []
+        for value in array_column.values.tolist():
+            # String cells are treated as serialized JSON; cells that fail to parse are skipped.
+            if isinstance(value, str):
+                try:
+                    value = json.loads(value)  # noqa: PLW2901
+                except (TypeError, ValueError):
+                    continue
+            if isinstance(value, dict):
+                flattened.append(value)
+            elif isinstance(value, list):
+                # Non-dict items of a list are ignored rather than rejecting the whole cell.
+                flattened.extend(item for item in value if isinstance(item, dict))
+        # Nothing struct-like was observed: signal "no children" to the caller.
+        if not flattened:
+            return []
+        # Merge all collected dicts into one shape, then render that shape as child columns.
+        merged_struct = cls.unique_json_structure(flattened)
+        return cls.construct_json_column_children(merged_struct)
+
 
 # pylint: disable=import-outside-toplevel
 class ParquetDataFrameColumnParser:
 
@@ -167,6 +167,63 @@ def test_construct_column(self):
         for el in zip(expected, actual):  # noqa: B905
             self.assertDictEqual(el[0], el[1])
 
+    def test_unique_json_structure_with_list_of_dicts(self):
+        """list-of-dicts values are merged into a struct shape (e.g. Iceberg `schema.fields`)."""
+        # One sample whose nested `fields` value is a list of dicts sharing the same keys.
+        sample_data = [
+            {
+                "schema": {
+                    "fields": [
+                        {"id": 1, "name": "customer_id", "type": "string"},
+                        {"id": 2, "name": "customer_type_cd", "type": "string"},
+                    ]
+                }
+            }
+        ]
+
+        actual = GenericDataFrameColumnParser.unique_json_structure(sample_data)
+        fields_value = actual["schema"]["fields"]
+
+        from metadata.utils.datalake.datalake_utils import _ArrayOfStruct
+
+        # The list must be replaced by the marker, and the marker must expose the merged keys.
+        assert isinstance(fields_value, _ArrayOfStruct)
+        assert set(fields_value.struct.keys()) == {"id", "name", "type"}
+
+    def test_unique_json_structure_merges_list_of_dicts_across_samples(self):
+        """list-of-dicts values across multiple samples are unioned, not overwritten."""
+        from metadata.utils.datalake.datalake_utils import _ArrayOfStruct
+
+        # Each sample contributes a different key set under the same `schema.fields` path.
+        sample_data = [
+            {"schema": {"fields": [{"id": 1, "name": "customer_id", "type": "string"}]}},
+            {"schema": {"fields": [{"id": 2, "required": False, "type": "string"}]}},
+            {"schema": {"fields": [{"description": "ciam id"}]}},
+        ]
+
+        actual = GenericDataFrameColumnParser.unique_json_structure(sample_data)
+        fields_value = actual["schema"]["fields"]
+
+        # The expected key set is the union over all three samples.
+        assert isinstance(fields_value, _ArrayOfStruct)
+        assert set(fields_value.struct.keys()) == {"id", "name", "type", "required", "description"}
+
+    def test_construct_column_with_array_of_struct(self):
+        """list-of-dicts values render as ARRAY<STRUCT<...>> with children for the struct fields."""
+        structure = {
+            "schema": {
+                "fields": [
+                    {"id": 1, "name": "customer_id", "type": "string"},
+                    {"id": 2, "name": "ciam_id", "type": "string"},
+                ]
+            }
+        }
+        # First merge the sample into a structure, then render that structure as child columns.
+        merged = GenericDataFrameColumnParser.unique_json_structure([structure])
+        children = GenericDataFrameColumnParser.construct_json_column_children(merged)
+
+        # `schema` is the only top-level key, so it is the first (and only) child.
+        schema_col = children[0]
+        fields_col = next(c for c in schema_col["children"] if c["name"] == "fields")
+
+        # The `fields` column must be typed as an array of structs and list the struct keys as children.
+        assert fields_col["dataType"] == DataType.ARRAY.value
+        assert fields_col["arrayDataType"] == DataType.STRUCT
+        assert {child["name"] for child in fields_col["children"]} == {"id", "name", "type"}
+
     def test_create_column_object(self):
         """test create column object fn"""
         formatted_column = GenericDataFrameColumnParser.construct_json_column_children(STRUCTURE)