From db8e97a7d3cb40c3d5847eed53392c765d1b5ffe Mon Sep 17 00:00:00 2001 From: alrevuelta Date: Fri, 8 May 2026 17:14:50 +0200 Subject: [PATCH] Support clickhouse positional tuple element access (tup.1) --- src/dialect/clickhouse.rs | 5 ++ src/dialect/mod.rs | 12 ++++ src/parser/mod.rs | 7 ++ src/tokenizer.rs | 128 ++++++++++++++++++++++++++++++++-- tests/sqlparser_clickhouse.rs | 23 ++++++ 5 files changed, 171 insertions(+), 4 deletions(-) diff --git a/src/dialect/clickhouse.rs b/src/dialect/clickhouse.rs index 6ee60cc993..9a6bb17def 100644 --- a/src/dialect/clickhouse.rs +++ b/src/dialect/clickhouse.rs @@ -72,6 +72,11 @@ impl Dialect for ClickHouseDialect { true } + /// See the ClickHouse `Tuple` data type documentation (referring to tuple elements). + fn supports_tuple_element_access(&self) -> bool { + true + } + // ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting // with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected. // diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 9b2ede40d2..af1de6ff90 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -479,6 +479,18 @@ pub trait Dialect: Debug + Any { false } + /// Returns true if the dialect supports ClickHouse-style positional tuple + /// element access such as `tup.1`, `arr[1].2`, or `(1, 2, 3).3`. When + /// enabled, the tokenizer emits a standalone `.` (instead of fusing it + /// into a decimal literal) when the previous token can be the left-hand + /// side of a tuple access: an identifier, `]`, `)`, or another integer + /// already inside the chain. + /// + /// See the ClickHouse `Tuple` data type documentation (referring to tuple elements). + fn supports_tuple_element_access(&self) -> bool { + false + } + /// Returns true if the dialect supports numbers containing underscores, e.g. 
`10_000_000` fn supports_numeric_literal_underscores(&self) -> bool { false diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 07497b04f6..7d656807f5 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1337,6 +1337,13 @@ impl<'a> Parser<'a> { AttachedToken(next_token), )); } + Token::Number(_, _) if self.dialect.supports_tuple_element_access() => { + // ClickHouse-style positional tuple access (`t.1`, + // `t.1.2`). Exit the wildcard-detection loop and let + // `parse_expr` handle the expression via the + // index-rewind fall-through below. + break; + } _ => { return self.expected("an identifier or a '*' after '.'", next_token); } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d9f131f8fc..da72b483d5 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1381,10 +1381,18 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::HexStringLiteral(s2))); } - // match one period - if let Some('.') = chars.peek() { - s.push('.'); - chars.next(); + // match one period. if we've just consumed an integer + // and the previous token is `.`, we're inside a ClickHouse + // tuple element access chain, and the trailing dot belongs + // to the chain, not to this number. + let in_tuple_chain = self.dialect.supports_tuple_element_access() + && prev_token == Some(&Token::Period) + && !s.is_empty(); + if !in_tuple_chain { + if let Some('.') = chars.peek() { + s.push('.'); + chars.next(); + } } // If the dialect supports identifiers that start with a numeric prefix @@ -1398,6 +1406,26 @@ impl<'a> Tokenizer<'a> { } } + // ClickHouse-style positional tuple element access: emit `.` as a + // standalone Period when it follows the LHS of a chain (an + // identifier, `]`, `)`, or another integer already in the chain), + // so e.g. `arr[1].1` and `t.1.2` parse as `CompoundFieldAccess` + // instead of being fused into a decimal literal. + if s == "." 
+ && self.dialect.supports_tuple_element_access() + && matches!( + prev_token, + Some( + Token::Word(_) + | Token::RBracket + | Token::RParen + | Token::Number(_, _) + ) + ) + { + return Ok(Some(Token::Period)); + } + // Consume fractional digits. s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) @@ -4303,6 +4331,98 @@ mod tests { ); } + #[test] + fn tokenize_clickhouse_tuple_element_access() { + let dialects = all_dialects_where(|dialect| dialect.supports_tuple_element_access()); + + // After a Word, RBracket, or RParen, `.` is split into `Period` + // and a separate integer `Number`, so the parser can build a + // CompoundFieldAccess instead of seeing a single decimal literal. + dialects.tokenizes_to( + "t.1", + vec![ + Token::make_word("t", None), + Token::Period, + Token::Number("1".to_string(), false), + ], + ); + + dialects.tokenizes_to( + "arr[1].2", + vec![ + Token::make_word("arr", None), + Token::LBracket, + Token::Number("1".to_string(), false), + Token::RBracket, + Token::Period, + Token::Number("2".to_string(), false), + ], + ); + + dialects.tokenizes_to( + "(1,2).2", + vec![ + Token::LParen, + Token::Number("1".to_string(), false), + Token::Comma, + Token::Number("2".to_string(), false), + Token::RParen, + Token::Period, + Token::Number("2".to_string(), false), + ], + ); + + // Nested access `t.1.2` (Tuple of Tuple) — the rule must re-fire on + // the second dot, and the integer between the two dots must not eat + // the trailing dot as a decimal fraction. + dialects.tokenizes_to( + "t.1.2", + vec![ + Token::make_word("t", None), + Token::Period, + Token::Number("1".to_string(), false), + Token::Period, + Token::Number("2".to_string(), false), + ], + ); + + // Decimal literals must remain untouched: in `0.5` the dot is scanned + // as part of the number, and in `.5` the previous token is whitespace. 
+ dialects.tokenizes_to( + "SELECT 0.5", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number("0.5".to_string(), false), + ], + ); + + dialects.tokenizes_to( + "SELECT .5", + vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Number(".5".to_string(), false), + ], + ); + + // Regression: dialects without the flag keep the old behavior. The + // dot and digit fuse into a single decimal-shaped Number token. + let tokens = Tokenizer::new(&GenericDialect {}, "arr[1].2") + .tokenize() + .unwrap(); + assert_eq!( + tokens, + vec![ + Token::make_word("arr", None), + Token::LBracket, + Token::Number("1".to_string(), false), + Token::RBracket, + Token::Number(".2".to_string(), false), + ] + ); + } + #[test] fn tokenize_period_underscore() { let sql = String::from("SELECT table._col"); diff --git a/tests/sqlparser_clickhouse.rs b/tests/sqlparser_clickhouse.rs index 716a3919fc..fe13e6997b 100644 --- a/tests/sqlparser_clickhouse.rs +++ b/tests/sqlparser_clickhouse.rs @@ -110,6 +110,29 @@ fn parse_map_access_expr() { ); } +#[test] +fn parse_tuple_element_access() { + // Single-level access on an array of tuples. + let sql = "SELECT arr[1].1 FROM t"; + let select = clickhouse().verified_only_select(sql); + assert_eq!( + &Expr::CompoundFieldAccess { + root: Box::new(Expr::Identifier(Ident::new("arr"))), + access_chain: vec![ + AccessExpr::Subscript(Subscript::Index { + index: Expr::value(Value::Number("1".parse().unwrap(), false)), + }), + AccessExpr::Dot(Expr::value(Value::Number("1".parse().unwrap(), false))), + ], + }, + expr_from_projection(only(&select.projection)) + ); + + clickhouse().verified_stmt("SELECT t.1 FROM x"); + clickhouse().verified_stmt("SELECT (1, 2, 3).2"); + clickhouse().verified_stmt("SELECT arr[1].1.2 FROM t"); +} + #[test] fn parse_array_expr() { let sql = "SELECT ['1', '2'] FROM test";