3333logger = utils_logger ()
3434
3535
class _ArrayOfStruct:
    """Marker for a JSON value observed as a list of dicts.

    Carries the merged struct shape so downstream column construction can
    render the value as ARRAY<STRUCT<...>> instead of an untyped array.
    """

    # Single attribute, no per-instance __dict__:
    #   struct -- merged key -> sample-value mapping of the list's dict items.
    __slots__ = ("struct",)

    def __init__(self, struct: Dict):  # noqa: UP006
        self.struct = struct

    def __repr__(self) -> str:
        # Show the wrapped shape so logged/debugged structures stay readable
        # instead of printing an opaque object address.
        return f"{type(self).__name__}(struct={self.struct!r})"
44+
45+
3646def fetch_dataframe_generator (
3747 config_source ,
3848 client ,
@@ -297,6 +307,10 @@ def _get_columns(cls, data_frame: "DataFrame"): # noqa: F821
297307 }
298308 if data_type == DataType .ARRAY :
299309 parsed_string ["arrayDataType" ] = DataType .UNKNOWN
310+ struct_children = cls ._get_array_struct_children (data_frame [column ].dropna ()[:100 ])
311+ if struct_children :
312+ parsed_string ["arrayDataType" ] = DataType .STRUCT
313+ parsed_string ["children" ] = struct_children
300314
301315 if data_type == DataType .JSON :
302316 parsed_string ["children" ] = cls .get_children (data_frame [column ].dropna ()[:100 ])
@@ -398,6 +412,11 @@ def unique_json_structure(cls, dicts: List[Dict]) -> Dict: # noqa: UP006
398412 result [key ] = cls .unique_json_structure (
399413 [nested_json if isinstance (nested_json , dict ) else {}, value ]
400414 )
415+ elif isinstance (value , list ) and value and all (isinstance (item , dict ) for item in value ):
416+ merged_struct = cls .unique_json_structure (value )
417+ existing = result .get (key )
418+ existing_struct = existing .struct if isinstance (existing , _ArrayOfStruct ) else {}
419+ result [key ] = _ArrayOfStruct (cls .unique_json_structure ([existing_struct , merged_struct ]))
401420 else :
402421 result [key ] = value
403422 return result
@@ -412,13 +431,19 @@ def construct_json_column_children(cls, json_column: Dict) -> List[Dict]: # noq
412431 children = []
413432 for key , value in json_column .items ():
414433 column = {}
415- type_ = type (value ).__name__ .lower ()
416- column ["dataTypeDisplay" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
417- column ["dataType" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
418434 column ["name" ] = truncate_column_name (key )
419435 column ["displayName" ] = key
420- if isinstance (value , dict ):
421- column ["children" ] = cls .construct_json_column_children (value )
436+ if isinstance (value , _ArrayOfStruct ):
437+ column ["dataType" ] = DataType .ARRAY .value
438+ column ["dataTypeDisplay" ] = DataType .ARRAY .value
439+ column ["arrayDataType" ] = DataType .STRUCT
440+ column ["children" ] = cls .construct_json_column_children (value .struct )
441+ else :
442+ type_ = type (value ).__name__ .lower ()
443+ column ["dataTypeDisplay" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
444+ column ["dataType" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
445+ if isinstance (value , dict ):
446+ column ["children" ] = cls .construct_json_column_children (value )
422447 children .append (column )
423448
424449 return children
@@ -446,6 +471,27 @@ def get_children(cls, json_column) -> List[Dict]: # noqa: UP006
446471
447472 return cls .construct_json_column_children (json_structure )
448473
@classmethod
def _get_array_struct_children(cls, array_column: Any) -> List[Dict]:  # noqa: UP006
    """Infer struct children for an ARRAY column that holds dict elements.

    Each cell of ``array_column`` is inspected in turn:

    * a string cell is decoded with ``json.loads``; cells that fail to decode
      are skipped;
    * a dict cell is collected as-is;
    * a list cell contributes only those of its items that are dicts.

    The collected dicts are merged with ``unique_json_structure`` and turned
    into column children with ``construct_json_column_children``. An empty
    list is returned when no dict was collected.
    """
    struct_samples = []
    for cell in array_column.values.tolist():
        decoded = cell
        if isinstance(cell, str):
            try:
                decoded = json.loads(cell)
            except (TypeError, ValueError):
                # Not valid JSON: this cell cannot contribute a struct shape.
                continue
        if isinstance(decoded, dict):
            struct_samples.append(decoded)
            continue
        if isinstance(decoded, list):
            struct_samples += [entry for entry in decoded if isinstance(entry, dict)]

    if struct_samples:
        return cls.construct_json_column_children(cls.unique_json_structure(struct_samples))
    return []
494+
449495
450496# pylint: disable=import-outside-toplevel
451497class ParquetDataFrameColumnParser :
0 commit comments