3434logger = utils_logger ()
3535
3636
37+ class _ArrayOfStruct :
38+ """Marker for a JSON value observed as a list of dicts. Carries the merged struct shape
39+ so downstream column construction can render it as ARRAY<STRUCT<...>>."""
40+
41+ __slots__ = ("struct" ,)
42+
43+ def __init__ (self , struct : Dict ): # noqa: UP006
44+ self .struct = struct
45+
46+
3747def fetch_dataframe_generator (
3848 config_source ,
3949 client ,
@@ -317,6 +327,10 @@ def _get_columns(cls, data_frame: "DataFrame"): # noqa: F821
317327 }
318328 if data_type == DataType .ARRAY :
319329 parsed_string ["arrayDataType" ] = DataType .UNKNOWN
330+ struct_children = cls ._get_array_struct_children (data_frame [column ].dropna ()[:100 ])
331+ if struct_children :
332+ parsed_string ["arrayDataType" ] = DataType .STRUCT
333+ parsed_string ["children" ] = struct_children
320334
321335 if data_type == DataType .JSON :
322336 parsed_string ["children" ] = cls .get_children (data_frame [column ].dropna ()[:100 ])
@@ -418,6 +432,11 @@ def unique_json_structure(cls, dicts: List[Dict]) -> Dict: # noqa: UP006
418432 result [key ] = cls .unique_json_structure (
419433 [nested_json if isinstance (nested_json , dict ) else {}, value ]
420434 )
435+ elif isinstance (value , list ) and value and all (isinstance (item , dict ) for item in value ):
436+ merged_struct = cls .unique_json_structure (value )
437+ existing = result .get (key )
438+ existing_struct = existing .struct if isinstance (existing , _ArrayOfStruct ) else {}
439+ result [key ] = _ArrayOfStruct (cls .unique_json_structure ([existing_struct , merged_struct ]))
421440 else :
422441 result [key ] = value
423442 return result
@@ -432,13 +451,19 @@ def construct_json_column_children(cls, json_column: Dict) -> List[Dict]: # noq
432451 children = []
433452 for key , value in json_column .items ():
434453 column = {}
435- type_ = type (value ).__name__ .lower ()
436- column ["dataTypeDisplay" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
437- column ["dataType" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
438454 column ["name" ] = truncate_column_name (key )
439455 column ["displayName" ] = key
440- if isinstance (value , dict ):
441- column ["children" ] = cls .construct_json_column_children (value )
456+ if isinstance (value , _ArrayOfStruct ):
457+ column ["dataType" ] = DataType .ARRAY .value
458+ column ["dataTypeDisplay" ] = DataType .ARRAY .value
459+ column ["arrayDataType" ] = DataType .STRUCT
460+ column ["children" ] = cls .construct_json_column_children (value .struct )
461+ else :
462+ type_ = type (value ).__name__ .lower ()
463+ column ["dataTypeDisplay" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
464+ column ["dataType" ] = cls ._data_formats .get (type_ , DataType .UNKNOWN ).value
465+ if isinstance (value , dict ):
466+ column ["children" ] = cls .construct_json_column_children (value )
442467 children .append (column )
443468
444469 return children
@@ -466,6 +491,27 @@ def get_children(cls, json_column) -> List[Dict]: # noqa: UP006
466491
467492 return cls .construct_json_column_children (json_structure )
468493
@classmethod
def _get_array_struct_children(cls, array_column: Any) -> List[Dict]:  # noqa: UP006
    """Infer struct children for an ARRAY column whose elements are dicts.

    Each cell of ``array_column`` (read through ``.values.tolist()``) is inspected:

    * a string cell is decoded with ``json.loads``; cells that fail to decode
      are skipped;
    * a dict cell (or a string decoding to a dict) is kept as-is;
    * a list cell contributes only those of its items that are dicts.

    The collected dicts are merged with ``cls.unique_json_structure`` and turned
    into children through ``cls.construct_json_column_children``.

    Returns an empty list when no dict was collected.
    """
    struct_samples = []
    for cell in array_column.values.tolist():
        candidate = cell
        if isinstance(candidate, str):
            try:
                candidate = json.loads(candidate)
            except (TypeError, ValueError):
                # Not valid JSON: nothing to learn from this cell.
                continue
        if isinstance(candidate, dict):
            struct_samples.append(candidate)
            continue
        if isinstance(candidate, list):
            struct_samples += [entry for entry in candidate if isinstance(entry, dict)]
    if struct_samples:
        return cls.construct_json_column_children(cls.unique_json_structure(struct_samples))
    return []
514+
469515
470516# pylint: disable=import-outside-toplevel
471517class ParquetDataFrameColumnParser :
0 commit comments