3232logger = utils_logger ()
3333
3434
35+ class _ArrayOfStruct :
36+ """Marker for a JSON value observed as a list of dicts. Carries the merged struct shape
37+ so downstream column construction can render it as ARRAY<STRUCT<...>>."""
38+
39+ __slots__ = ("struct" ,)
40+
41+ def __init__ (self , struct : Dict ): # noqa: UP006
42+ self .struct = struct
43+
44+
3545def fetch_dataframe_generator (
3646 config_source ,
3747 client ,
@@ -288,6 +298,10 @@ def _get_columns(cls, data_frame: "DataFrame"):
288298 }
289299 if data_type == DataType .ARRAY :
290300 parsed_string ["arrayDataType" ] = DataType .UNKNOWN
301+ struct_children = cls ._get_array_struct_children (data_frame [column ].dropna ()[:100 ])
302+ if struct_children :
303+ parsed_string ["arrayDataType" ] = DataType .STRUCT
304+ parsed_string ["children" ] = struct_children
291305
292306 if data_type == DataType .JSON :
293307 parsed_string ["children" ] = cls .get_children (
@@ -400,6 +414,11 @@ def unique_json_structure(cls, dicts: List[Dict]) -> Dict:
400414 result [key ] = cls .unique_json_structure (
401415 [nested_json if isinstance (nested_json , dict ) else {}, value ]
402416 )
417+ elif isinstance (value , list ) and value and all (isinstance (item , dict ) for item in value ):
418+ merged_struct = cls .unique_json_structure (value )
419+ existing = result .get (key )
420+ existing_struct = existing .struct if isinstance (existing , _ArrayOfStruct ) else {}
421+ result [key ] = _ArrayOfStruct (cls .unique_json_structure ([existing_struct , merged_struct ]))
403422 else :
404423 result [key ] = value
405424 return result
@@ -414,15 +433,19 @@ def construct_json_column_children(cls, json_column: Dict) -> List[Dict]:
414433 children = []
415434 for key , value in json_column .items ():
416435 column = {}
417- type_ = type (value ).__name__ .lower ()
418- column ["dataTypeDisplay" ] = cls ._data_formats .get (
419- type_ , DataType .UNKNOWN
420- ).value
421- column ["dataType" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
422436 column ["name" ] = truncate_column_name (key )
423437 column ["displayName" ] = key
424- if isinstance (value , dict ):
425- column ["children" ] = cls .construct_json_column_children (value )
438+ if isinstance (value , _ArrayOfStruct ):
439+ column ["dataType" ] = DataType .ARRAY .value
440+ column ["dataTypeDisplay" ] = DataType .ARRAY .value
441+ column ["arrayDataType" ] = DataType .STRUCT
442+ column ["children" ] = cls .construct_json_column_children (value .struct )
443+ else :
444+ type_ = type (value ).__name__ .lower ()
445+ column ["dataTypeDisplay" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
446+ column ["dataType" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
447+ if isinstance (value , dict ):
448+ column ["children" ] = cls .construct_json_column_children (value )
426449 children .append (column )
427450
428451 return children
@@ -451,6 +474,27 @@ def get_children(cls, json_column) -> List[Dict]:
451474
452475 return cls .construct_json_column_children (json_structure )
453476
@classmethod
def _get_array_struct_children(cls, array_column: Any) -> List[Dict]:  # noqa: UP006
    """Infer struct children for an ARRAY column whose elements are dicts.

    Each cell is inspected in turn: string cells are first decoded as JSON
    (cells that fail to decode are skipped), dict cells are collected as-is,
    and list cells contribute only their dict items. The collected dicts are
    merged with ``unique_json_structure`` and turned into child columns with
    ``construct_json_column_children``.

    Args:
        array_column: Object exposing ``.values.tolist()``
            (presumably a pandas Series of sampled cells; confirm with caller).

    Returns:
        The child column definitions, or an empty list when no dicts are found.
    """
    struct_samples = []
    for cell in array_column.values.tolist():
        candidate = cell
        if isinstance(candidate, str):
            # String cells may hold serialized JSON; undecodable ones are ignored.
            try:
                candidate = json.loads(candidate)
            except (TypeError, ValueError):
                continue
        if isinstance(candidate, dict):
            struct_samples.append(candidate)
            continue
        if isinstance(candidate, list):
            # Non-dict items inside the list are dropped.
            struct_samples += [element for element in candidate if isinstance(element, dict)]

    if struct_samples:
        return cls.construct_json_column_children(cls.unique_json_structure(struct_samples))
    return []
497+
454498
455499# pylint: disable=import-outside-toplevel
456500class ParquetDataFrameColumnParser :
0 commit comments