|
11 | 11 |
|
12 | 12 | """Athena source module""" |
13 | 13 |
|
14 | | -import threading |
| 14 | +import hashlib |
| 15 | +import re |
15 | 16 | import traceback |
16 | | -from typing import Dict, Iterable, Optional, Set, Tuple # noqa: UP035 |
| 17 | +from typing import Iterable, Optional, Tuple # noqa: UP035 |
17 | 18 |
|
18 | 19 | from pyathena.sqlalchemy.base import AthenaDialect |
| 20 | +from sqlalchemy import text |
19 | 21 | from sqlalchemy.engine.reflection import Inspector |
20 | 22 |
|
21 | 23 | from metadata.clients.aws_client import AWSClient |
|
40 | 42 | from metadata.generated.schema.metadataIngestion.workflow import ( |
41 | 43 | Source as WorkflowSource, |
42 | 44 | ) |
| 45 | +from metadata.generated.schema.type.basic import EntityName, Markdown |
43 | 46 | from metadata.ingestion.api.models import Either |
44 | 47 | from metadata.ingestion.api.steps import InvalidSourceException |
45 | 48 | from metadata.ingestion.models.custom_properties import ( |
|
81 | 84 | ATHENA_TAG = "ATHENA TAG" |
82 | 85 | ATHENA_TAG_CLASSIFICATION = "ATHENA TAG CLASSIFICATION" |
83 | 86 |
|
84 | | -ATHENA_TABLE_PROPS_CONTEXT_KEY = "_athena_current_tbl_props" |
| 87 | +ICEBERG_TABLE_TYPE = "ICEBERG" |
| 88 | +PROPERTY_NAME_INVALID_CHARS_PATTERN = re.compile(r"[^A-Za-z0-9_.\-]") |
| 89 | +PROPERTY_NAME_REPLACEMENT = "__" |
| 90 | +PROPERTY_NAME_MAX_LENGTH = 256 |
85 | 91 |
|
86 | 92 | ATHENA_INTERVAL_TYPE_MAP = { |
87 | 93 | **dict.fromkeys(["enum", "string", "VARCHAR"], PartitionIntervalTypes.COLUMN_VALUE), |
@@ -117,10 +123,8 @@ def __init__( |
117 | 123 | self.athena_lake_formation_client = AthenaLakeFormationClient(connection=self.service_connection) |
118 | 124 | self.external_location_map = {} |
119 | 125 | self.schema_description_map = {} |
120 | | - self._thread_local = threading.local() |
121 | 126 | self.glue_client = None |
122 | | - self._processed_prop: Set[str] = set() # noqa: UP006 |
123 | | - self._processed_prop_lock = threading.Lock() |
| 127 | + self._processed_prop: set[str] = set() |
124 | 128 | self._string_property_type_ref = None |
125 | 129 |
|
126 | 130 | def prepare(self): |
@@ -160,7 +164,9 @@ def query_table_names_and_types(self, schema_name: str) -> Iterable[TableNameAnd |
160 | 164 | for page in paginator.paginate(DatabaseName=schema_name): |
161 | 165 | for table in page.get("TableList", []): |
162 | 166 | params = table.get("Parameters", {}) |
163 | | - table_type = TableType.Iceberg if params.get("table_type") == "ICEBERG" else TableType.External |
| 167 | + table_type = ( |
| 168 | + TableType.Iceberg if params.get("table_type") == ICEBERG_TABLE_TYPE else TableType.External |
| 169 | + ) |
164 | 170 | results.append(TableNameAndType(name=table["Name"], type_=table_type)) |
165 | 171 | return results # noqa: TRY300 |
166 | 172 | except Exception as exc: |
@@ -307,22 +313,12 @@ def yield_table_tags( |
307 | 313 | # pylint: disable=arguments-differ |
308 | 314 | def get_table_description(self, schema_name: str, table_name: str, inspector: Inspector) -> str: |
309 | 315 | description = None |
310 | | - setattr(self._thread_local, ATHENA_TABLE_PROPS_CONTEXT_KEY, {}) |
311 | 316 | try: |
312 | 317 | table_info: dict = inspector.get_table_comment(table_name, schema_name) |
313 | 318 | table_option = inspector.get_table_options(table_name, schema_name) |
314 | 319 | self.external_location_map[(self.context.get().database, schema_name, table_name)] = table_option.get( |
315 | 320 | "awsathena_location" |
316 | 321 | ) |
317 | | - setattr( |
318 | | - self._thread_local, |
319 | | - ATHENA_TABLE_PROPS_CONTEXT_KEY, |
320 | | - { |
321 | | - prop_name: str(prop_value) |
322 | | - for prop_name, prop_value in (table_option.get("awsathena_tblproperties") or {}).items() |
323 | | - if prop_value is not None |
324 | | - }, |
325 | | - ) |
326 | 322 | # Catch any exception without breaking the ingestion |
327 | 323 | except Exception as exc: # pylint: disable=broad-except |
328 | 324 | logger.debug(traceback.format_exc()) |
@@ -352,35 +348,56 @@ def _get_columns_internal( |
352 | 348 | catalog_id=self.service_connection.catalogId, |
353 | 349 | ) |
354 | 350 |
|
355 | | - def get_table_extensions(self, table_name: str) -> Optional[Dict[str, str]]: # noqa: UP006, UP045 |
| 351 | + def get_table_extensions(self, table_name: str, table_type: TableType | None = None) -> dict[str, str] | None: |
| 352 | + if not getattr(self.source_config, "includeCustomProperties", False): |
| 353 | + return None |
356 | 354 | if not self._string_property_type_ref: |
357 | 355 | return None |
358 | | - tbl_properties = getattr(self._thread_local, ATHENA_TABLE_PROPS_CONTEXT_KEY, {}) |
| 356 | + if table_type != TableType.Iceberg: |
| 357 | + return None |
| 358 | + schema_name: str = getattr(self.context.get(), "database_schema", "") |
| 359 | + tbl_properties = self._fetch_iceberg_properties(schema_name, table_name) |
359 | 360 | if not tbl_properties: |
360 | 361 | return None |
361 | 362 | registered_properties = {} |
362 | 363 | for prop_name, prop_value in tbl_properties.items(): |
363 | | - with self._processed_prop_lock: |
364 | | - prop_already_registered = prop_name in self._processed_prop |
365 | | - if not prop_already_registered: |
| 364 | + if not prop_value: |
| 365 | + continue |
| 366 | + sanitized_name = PROPERTY_NAME_INVALID_CHARS_PATTERN.sub(PROPERTY_NAME_REPLACEMENT, prop_name) |
| 367 | + if len(sanitized_name) > PROPERTY_NAME_MAX_LENGTH: |
| 368 | + sanitized_name = hashlib.md5(prop_name.encode("utf-8"), usedforsecurity=False).hexdigest() |
| 369 | + if sanitized_name not in self._processed_prop: |
366 | 370 | try: |
367 | | - self.metadata.create_or_update_custom_property( |
| 371 | + self.metadata.create_or_update_custom_property( # pyright: ignore[reportUnknownMemberType, reportUnusedCallResult] |
368 | 372 | OMetaCustomProperties( |
369 | 373 | entity_type=Table, |
370 | 374 | createCustomPropertyRequest=CreateCustomPropertyRequest( |
371 | | - name=prop_name, |
372 | | - description=prop_name, |
| 375 | + name=EntityName(sanitized_name), |
| 376 | + displayName=prop_name, |
| 377 | + description=Markdown(prop_name), |
373 | 378 | propertyType=self._string_property_type_ref, |
| 379 | + customPropertyConfig=None, |
374 | 380 | ), |
375 | 381 | ) |
376 | 382 | ) |
377 | | - with self._processed_prop_lock: |
378 | | - self._processed_prop.add(prop_name) |
| 383 | + self._processed_prop.add(sanitized_name) |
379 | 384 | except Exception as exc: |
380 | 385 | logger.warning( |
381 | 386 | f"Failed to register custom property [{prop_name}] for Athena table properties: {exc}" |
382 | 387 | ) |
383 | 388 | logger.debug(traceback.format_exc()) |
384 | 389 | continue |
385 | | - registered_properties[prop_name] = prop_value |
| 390 | + registered_properties[sanitized_name] = prop_value |
386 | 391 | return registered_properties or None |
| 392 | + |
| 393 | + def _fetch_iceberg_properties(self, schema_name: str, table_name: str) -> dict[str, str]: |
| 394 | + """Read Iceberg native properties from Athena's `<table>$properties` metatable.""" |
| 395 | + query = text(f'SELECT key, value FROM "{schema_name}"."{table_name}$properties"') |
| 396 | + try: |
| 397 | + with self.engine.connect() as conn: |
| 398 | + result = conn.execute(query) |
| 399 | + return {str(row[0]): str(row[1]) for row in result if row[0] is not None and row[1] is not None} |
| 400 | + except Exception as exc: |
| 401 | + logger.debug(f"Unable to read Iceberg $properties for [{schema_name}.{table_name}]: {exc}") |
| 402 | + logger.debug(traceback.format_exc()) |
| 403 | + return {} |
0 commit comments