@@ -112,6 +112,43 @@ def _should_skip_gcs_cold_storage(blob) -> bool:
112112 storage_class = getattr (blob , "storage_class" , None )
113113 return bool (storage_class and storage_class in GCS_COLD_STORAGE_CLASSES )
114114
115+ def _discover_iceberg_dirs (
116+ self ,
117+ bucket ,
118+ prefix : Optional [str ], # noqa: UP045
119+ skip_cold_storage : bool ,
120+ ) -> Tuple [Dict [str , Tuple [int , str , int | None ]], Set [str ]]: # noqa: UP006
121+ """Pass 1: discover Iceberg table directories and return (iceberg_tables, iceberg_dirs)."""
122+ iceberg_tables : Dict [str , Tuple [int , str , int | None ]] = {} # noqa: UP006
123+ cold_iceberg_dirs : Set [str ] = set () # noqa: UP006
124+
125+ for blob in bucket .list_blobs (prefix = prefix ):
126+ if skip_cold_storage and self ._should_skip_gcs_cold_storage (blob ):
127+ match = self ._ICEBERG_METADATA_RE .match (blob .name )
128+ if match :
129+ cold_iceberg_dirs .add (match .group (1 ))
130+ continue
131+ self ._update_iceberg_entry (iceberg_tables , blob .name , blob .size )
132+
133+ return iceberg_tables , set (iceberg_tables .keys ()) | cold_iceberg_dirs
134+
def _yield_regular_files(
    self,
    bucket,
    prefix: Optional[str],  # noqa: UP045
    skip_cold_storage: bool,
    iceberg_dirs: Set[str],  # noqa: UP006
) -> Iterable[Tuple[str, Optional[int]]]:  # noqa: UP006, UP045
    """Pass 2: stream regular files, skipping Iceberg directory contents.

    Args:
        bucket: Object exposing ``list_blobs(prefix=...)``; each listed blob
            is read for its ``name`` and ``size``.
        prefix: Listing prefix forwarded unchanged to ``list_blobs``.
        skip_cold_storage: When true, blobs for which
            ``_should_skip_gcs_cold_storage`` returns true are not yielded.
        iceberg_dirs: Directory paths (without trailing slash) whose contents
            must be excluded. When empty, no Iceberg filtering is applied.

    Yields:
        ``(blob.name, blob.size)`` for every blob that survives both filters,
        in listing order.
    """
    # Build the "<dir>/" prefixes once instead of re-concatenating them for
    # every blob; str.startswith accepts a tuple and checks all of them in a
    # single call. The trailing slash keeps "tbl" from matching "tbl2/...".
    dir_prefixes = tuple(d + "/" for d in iceberg_dirs or ())

    for blob in bucket.list_blobs(prefix=prefix):
        if skip_cold_storage and self._should_skip_gcs_cold_storage(blob):
            continue
        # Iceberg filtering only applies when at least one table directory is
        # known; otherwise every remaining blob is a regular file.
        if dir_prefixes and (
            self._ICEBERG_METADATA_RE.match(blob.name) or blob.name.startswith(dir_prefixes)
        ):
            continue
        yield blob.name, blob.size
151+
115152 def get_table_names (
116153 self ,
117154 bucket_name : str ,
@@ -126,38 +163,12 @@ def get_table_names(
126163 files without accumulation, keeping memory overhead at O(1) per object.
127164 """
128165 bucket = self ._client .get_bucket (bucket_name )
129- iceberg_tables : Dict [str , Tuple [int , str , int | None ]] = {} # noqa: UP006
130- cold_iceberg_dirs : Set [str ] = set () # noqa: UP006
166+ iceberg_tables , iceberg_dirs = self ._discover_iceberg_dirs (bucket , prefix , skip_cold_storage )
131167
132- for blob in bucket .list_blobs (prefix = prefix ):
133- is_cold = skip_cold_storage and self ._should_skip_gcs_cold_storage (blob )
134- if is_cold :
135- logger .debug (
136- f"Skipping cold storage object: { blob .name } (storage_class: { getattr (blob , 'storage_class' , None )} )"
137- )
138- match = self ._ICEBERG_METADATA_RE .match (blob .name )
139- if match :
140- cold_iceberg_dirs .add (match .group (1 ))
141- continue
142- self ._update_iceberg_entry (iceberg_tables , blob .name , blob .size )
143-
144- iceberg_dirs = set (iceberg_tables .keys ()) | cold_iceberg_dirs
145168 for _ , metadata_blob_path , size in iceberg_tables .values ():
146169 yield metadata_blob_path , size
147170
148- if not iceberg_dirs :
149- for blob in bucket .list_blobs (prefix = prefix ):
150- if skip_cold_storage and self ._should_skip_gcs_cold_storage (blob ):
151- continue
152- yield blob .name , blob .size
153- else :
154- for blob in bucket .list_blobs (prefix = prefix ):
155- if skip_cold_storage and self ._should_skip_gcs_cold_storage (blob ):
156- continue
157- if not self ._ICEBERG_METADATA_RE .match (blob .name ) and not any (
158- blob .name .startswith (d + "/" ) for d in iceberg_dirs
159- ):
160- yield blob .name , blob .size
171+ yield from self ._yield_regular_files (bucket , prefix , skip_cold_storage , iceberg_dirs )
161172
162173 def close (self , service_connection ):
163174 os .environ .pop ("GOOGLE_CLOUD_PROJECT" , "" )
0 commit comments