|
11 | 11 |
|
12 | 12 | """Athena source module""" |
13 | 13 |
|
14 | | -import threading |
| 14 | +import hashlib |
| 15 | +import re |
15 | 16 | import traceback |
16 | 17 | from typing import Dict, Iterable, Optional, Set, Tuple |
17 | 18 |
|
18 | 19 | from pyathena.sqlalchemy.base import AthenaDialect |
| 20 | +from sqlalchemy import text |
19 | 21 | from sqlalchemy.engine.reflection import Inspector |
20 | 22 |
|
21 | 23 | from metadata.clients.aws_client import AWSClient |
|
81 | 83 | ATHENA_TAG = "ATHENA TAG" |
82 | 84 | ATHENA_TAG_CLASSIFICATION = "ATHENA TAG CLASSIFICATION" |
83 | 85 |
|
84 | | -ATHENA_TABLE_PROPS_CONTEXT_KEY = "_athena_current_tbl_props" |
| 86 | +ICEBERG_TABLE_TYPE = "ICEBERG" |
| 87 | +PROPERTY_NAME_INVALID_CHARS_PATTERN = re.compile(r"[^A-Za-z0-9_]") |
| 88 | +PROPERTY_NAME_REPLACEMENT = "__" |
| 89 | +PROPERTY_NAME_MAX_LENGTH = 256 |
85 | 90 |
|
86 | 91 | ATHENA_INTERVAL_TYPE_MAP = { |
87 | 92 | **dict.fromkeys(["enum", "string", "VARCHAR"], PartitionIntervalTypes.COLUMN_VALUE), |
@@ -125,10 +130,8 @@ def __init__( |
125 | 130 | ) |
126 | 131 | self.external_location_map = {} |
127 | 132 | self.schema_description_map = {} |
128 | | - self._thread_local = threading.local() |
129 | 133 | self.glue_client = None |
130 | 134 | self._processed_prop: Set[str] = set() |
131 | | - self._processed_prop_lock = threading.Lock() |
132 | 135 | self._string_property_type_ref = None |
133 | 136 |
|
134 | 137 | def prepare(self): |
@@ -178,7 +181,7 @@ def query_table_names_and_types( |
178 | 181 | params = table.get("Parameters", {}) |
179 | 182 | table_type = ( |
180 | 183 | TableType.Iceberg |
181 | | - if params.get("table_type") == "ICEBERG" |
| 184 | + if params.get("table_type") == ICEBERG_TABLE_TYPE |
182 | 185 | else TableType.External |
183 | 186 | ) |
184 | 187 | results.append( |
@@ -340,24 +343,12 @@ def get_table_description( |
340 | 343 | self, schema_name: str, table_name: str, inspector: Inspector |
341 | 344 | ) -> str: |
342 | 345 | description = None |
343 | | - setattr(self._thread_local, ATHENA_TABLE_PROPS_CONTEXT_KEY, {}) |
344 | 346 | try: |
345 | 347 | table_info: dict = inspector.get_table_comment(table_name, schema_name) |
346 | 348 | table_option = inspector.get_table_options(table_name, schema_name) |
347 | 349 | self.external_location_map[ |
348 | 350 | (self.context.get().database, schema_name, table_name) |
349 | 351 | ] = table_option.get("awsathena_location") |
350 | | - setattr( |
351 | | - self._thread_local, |
352 | | - ATHENA_TABLE_PROPS_CONTEXT_KEY, |
353 | | - { |
354 | | - prop_name: str(prop_value) |
355 | | - for prop_name, prop_value in ( |
356 | | - table_option.get("awsathena_tblproperties") or {} |
357 | | - ).items() |
358 | | - if prop_value is not None |
359 | | - }, |
360 | | - ) |
361 | 352 | # Catch any exception without breaking the ingestion |
362 | 353 | except Exception as exc: # pylint: disable=broad-except |
363 | 354 | logger.debug(traceback.format_exc()) |
@@ -389,35 +380,69 @@ def _get_columns_internal( |
389 | 380 | catalog_id=self.service_connection.catalogId, |
390 | 381 | ) |
391 | 382 |
|
392 | | - def get_table_extensions(self, table_name: str) -> Optional[Dict[str, str]]: |
| 383 | + def get_table_extensions( |
| 384 | + self, table_name: str, table_type: Optional[TableType] = None |
| 385 | + ) -> Optional[Dict[str, str]]: |
393 | 386 | if not self._string_property_type_ref: |
394 | 387 | return None |
395 | | - tbl_properties = getattr(self._thread_local, ATHENA_TABLE_PROPS_CONTEXT_KEY, {}) |
| 388 | + if table_type != TableType.Iceberg: |
| 389 | + return None |
| 390 | + schema_name = self.context.get().database_schema |
| 391 | + tbl_properties = self._fetch_iceberg_properties(schema_name, table_name) |
396 | 392 | if not tbl_properties: |
397 | 393 | return None |
398 | 394 | registered_properties = {} |
399 | 395 | for prop_name, prop_value in tbl_properties.items(): |
400 | | - with self._processed_prop_lock: |
401 | | - prop_already_registered = prop_name in self._processed_prop |
402 | | - if not prop_already_registered: |
| 396 | + if prop_value is None or prop_value == "": |
| 397 | + continue |
| 398 | + sanitized_name = PROPERTY_NAME_INVALID_CHARS_PATTERN.sub( |
| 399 | + PROPERTY_NAME_REPLACEMENT, prop_name |
| 400 | + ) |
| 401 | + if len(sanitized_name) > PROPERTY_NAME_MAX_LENGTH: |
| 402 | + sanitized_name = hashlib.md5( |
| 403 | + prop_name.encode("utf-8"), usedforsecurity=False |
| 404 | + ).hexdigest() |
| 405 | + if sanitized_name not in self._processed_prop: |
403 | 406 | try: |
404 | 407 | self.metadata.create_or_update_custom_property( |
405 | 408 | OMetaCustomProperties( |
406 | 409 | entity_type=Table, |
407 | 410 | createCustomPropertyRequest=CreateCustomPropertyRequest( |
408 | | - name=prop_name, |
| 411 | + name=sanitized_name, |
| 412 | + displayName=prop_name, |
409 | 413 | description=prop_name, |
410 | 414 | propertyType=self._string_property_type_ref, |
411 | 415 | ), |
412 | 416 | ) |
413 | 417 | ) |
414 | | - with self._processed_prop_lock: |
415 | | - self._processed_prop.add(prop_name) |
| 418 | + self._processed_prop.add(sanitized_name) |
416 | 419 | except Exception as exc: |
417 | 420 | logger.warning( |
418 | 421 | f"Failed to register custom property [{prop_name}] for Athena table properties: {exc}" |
419 | 422 | ) |
420 | 423 | logger.debug(traceback.format_exc()) |
421 | 424 | continue |
422 | | - registered_properties[prop_name] = prop_value |
| 425 | + registered_properties[sanitized_name] = prop_value |
423 | 426 | return registered_properties or None |
| 427 | + |
| 428 | + def _fetch_iceberg_properties( |
| 429 | + self, schema_name: str, table_name: str |
| 430 | + ) -> Dict[str, str]: |
| 431 | + """Read Iceberg native properties from Athena's `<table>$properties` metatable.""" |
| 432 | + query = text( |
| 433 | + f'SELECT key, value FROM "{schema_name}"."{table_name}$properties"' |
| 434 | + ) |
| 435 | + try: |
| 436 | + with self.engine.connect() as conn: |
| 437 | + result = conn.execute(query) |
| 438 | + return { |
| 439 | + str(row[0]): str(row[1]) |
| 440 | + for row in result |
| 441 | + if row[0] is not None and row[1] is not None |
| 442 | + } |
| 443 | + except Exception as exc: |
| 444 | + logger.debug( |
| 445 | + f"Unable to read Iceberg $properties for [{schema_name}.{table_name}]: {exc}" |
| 446 | + ) |
| 447 | + logger.debug(traceback.format_exc()) |
| 448 | + return {} |
0 commit comments