-
Notifications
You must be signed in to change notification settings - Fork 2.1k
fix(clickhouse): add downstream lineage for MATERIALIZED VIEW TO clause #27628
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,9 @@ | |
| """ | ||
| Lineage Parser configuration | ||
| """ | ||
|
|
||
| import hashlib | ||
| import re | ||
| import time | ||
| import traceback | ||
| from collections import defaultdict | ||
|
|
@@ -64,6 +66,31 @@ | |
| # max memory in MB that lineage parsing can consume | ||
| LINEAGE_PARSING_MEMORY_LIMIT_MB = 100 | ||
|
|
||
| # Pre-compiled regex to rewrite ClickHouse MATERIALIZED VIEW ... TO <target> queries | ||
| # into CREATE TABLE <target> AS SELECT ... so that sqllineage can correctly identify | ||
| # the downstream target table instead of the view name itself. | ||
| # | ||
| # Handles all documented ClickHouse CREATE MATERIALIZED VIEW forms: | ||
| # 1. CREATE MATERIALIZED VIEW [IF NOT EXISTS] mv_name [ON CLUSTER c] TO target AS SELECT ... | ||
| # 2. CREATE MATERIALIZED VIEW mv_name REFRESH EVERY n HOUR [OFFSET m MINUTE] TO target (col1, col2) | ||
| # [DEFINER = user] [SQL SECURITY ...] AS SELECT ... | ||
| # Also: REFRESH AFTER n SECOND form (alternative ClickHouse refresh syntax) | ||
| # 3. ENGINE = ... clauses between TO target and AS SELECT are skipped. | ||
| # | ||
| # The character class for <target> handles backtick-quoted segments with spaces | ||
| # and stops at the first whitespace / opening paren NOT inside backticks. | ||
| _CLICKHOUSE_MV_TO_RE = re.compile( | ||
| r"^\s*CREATE\s+MATERIALIZED\s+VIEW\s+" | ||
| r"(?:IF\s+NOT\s+EXISTS\s+)?" # optional IF NOT EXISTS | ||
| r"(?:`[^`]+`|\S+)\s+" # skip MV name (handles quoted names with spaces) | ||
|
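There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 💡 Edge Case: MV-name skip group fails for multi-part backtick-quoted names. The regex group that skips the MV name on line 85 ( (?:`[^`]+`|\S+)\s+ ) does not match a multi-part name whose parts are backtick-quoted and contain spaces, such as `my db`.`my view`: the backtick alternative consumes only the first part and then requires whitespace where the dot is, while \S+ stops at the space inside the quotes, so the overall pattern never reaches the TO clause. This is an edge case (ClickHouse MV names with spaces in both schema and table parts are rare), but it would result in a silent false-negative where the lineage is not captured. Suggested fix: let the name be a dot-separated sequence of quoted or unquoted parts, e.g. (?:`[^`]+`|[^\s.`]+)(?:\.(?:`[^`]+`|[^\s.`]+))*\s+ Was this helpful? React with 👍 / 👎 | Reply |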
||
| r"(?:ON\s+CLUSTER\s+\S+\s+)?" # optional ON CLUSTER <cluster_name> | ||
| r"(?:REFRESH\s+(?:EVERY|AFTER)\s+(?:(?!\bTO\b)[\s\S])*?)?" # optional REFRESH | ||
| r"TO\s+((?:`[^`]+`|[\w`\.\[\]\"])+)" # capture target table | ||
| r"(?:\s*\([^)]*\))?" # optional column list (col1, col2, ...) | ||
| r".*?AS\s+(SELECT.*)", # skip ENGINE/DEFINER/SETTINGS, capture SELECT body | ||
| re.IGNORECASE | re.DOTALL, | ||
| ) | ||
|
|
||
|
|
||
| class LineageParser: | ||
| """ | ||
|
|
@@ -234,7 +261,7 @@ def table_aliases(self) -> Dict[str, str]: | |
| # Check if involved_tables is present | ||
| if not self.involved_tables: | ||
| logger.debug( | ||
| f"[{self.query_hash}] [UsageSink] No involved tables found — alias map will be empty." | ||
| f"[{self.query_hash}] [UsageSink] No involved tables found -- alias map will be empty." | ||
| ) | ||
| return {} | ||
|
|
||
|
|
@@ -512,6 +539,22 @@ def clean_raw_query(cls, raw_query: str) -> Optional[str]: | |
|
|
||
| clean_query = clean_query.replace("\\n", "\n") | ||
|
|
||
| # Rewrite ClickHouse MATERIALIZED VIEW ... TO <target> AS SELECT ... | ||
| # into CREATE TABLE <target> AS SELECT ... so that sqllineage correctly | ||
| # identifies the downstream target table instead of the view name. | ||
| # | ||
| # Without this rewrite, sqllineage treats the MV name as the CREATE target | ||
| # and never registers the table named after TO as a downstream node. | ||
| # We handle it at this layer (query normalisation) so all three parsers | ||
| # (SqlGlot, SqlFluff, SqlParse) benefit automatically and no synthetic | ||
| # queries are written to query history. | ||
| if insensitive_match(clean_query, r"^\s*CREATE\s+MATERIALIZED\s+VIEW\s+"): | ||
| mv_to_match = _CLICKHOUSE_MV_TO_RE.search(clean_query) | ||
| if mv_to_match: | ||
| target_table = mv_to_match.group(1).strip() | ||
| select_body = mv_to_match.group(2) | ||
| clean_query = f"CREATE TABLE {target_table} AS {select_body}" | ||
|
|
||
| if insensitive_match( | ||
| clean_query, r"\s*/\*.*?\*/\s*merge.*into.*?when matched.*?" | ||
| ): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Quality: Stray indentation on module-level comment
Line 80 has a 4-space indent on a module-level comment (# The character class for <target>...). While Python ignores comment indentation, this visually suggests it belongs to a code block rather than continuing the module-level documentation block above it. Was this helpful? React with 👍 / 👎 | Reply
gitar fix to apply this suggestion