From e6d99fc385a2488c4b238e75d4505c743e7ca7b8 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Fri, 22 May 2026 22:28:06 +0000 Subject: [PATCH 1/2] feat(sql-utilities): support multiple SELECT projections with one aggregate --- .../src/ast_matching/sqlparser_test.rs | 89 ++++++++++ .../src/ast_matching/sqlpattern_parser.rs | 156 ++++++++++-------- 2 files changed, 173 insertions(+), 72 deletions(-) diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs index a6f593a1..cc8cdef8 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs @@ -802,4 +802,93 @@ mod tests { Some(QueryError::SpatialDurationSmall), ); } + + // ── Multi-projection SELECT (group cols + aggregate) ───────────────────── + // + // ClickHouse and standard SQL allow `SELECT g1, g2, agg(v) FROM t GROUP BY g1, g2` + // (one row per group with the grouping keys included alongside the aggregate). + // The pattern parser must also accept this shape and produce the same structural + // SQLQueryData as the single-projection form `SELECT agg(v) FROM t GROUP BY g1, g2`. + + #[test] + fn test_multi_projection_groupcols_then_aggregate() { + let query = parse_sql_query( + "SELECT L1, L2, L3, L4, SUM(value) FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1, L2, L3, L4", + ) + .expect("multi-projection SELECT with group cols + aggregate should parse"); + assert_eq!(query.metric, "cpu_usage"); + assert_eq!(query.aggregation_info.get_name(), "SUM"); + assert!(query.labels.contains("L1")); + assert!(query.labels.contains("L4")); + } + + #[test] + fn test_multi_projection_aggregate_first() { + let query = parse_sql_query( + "SELECT SUM(value), L1, L2, L3, L4 FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1, L2, L3, L4", + ) + .expect("aggregate-first multi-projection SELECT should parse"); + assert_eq!(query.aggregation_info.get_name(), "SUM"); + } + + #[test] + fn test_multi_projection_quantile_clickhouse_syntax() { + // The exact shape of the user's netflow query: ClickHouse parametric quantile + // with grouping columns alongside the aggregate in SELECT. + let query = parse_sql_query( + "SELECT L1, L2, quantile(0.99)(value) AS p99 FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) \ + GROUP BY L1, L2", + ) + .expect("multi-projection ClickHouse parametric quantile should parse"); + assert_eq!(query.aggregation_info.get_name(), "QUANTILE"); + assert_eq!(query.aggregation_info.get_args()[0], "0.99"); + } + + #[test] + fn test_multi_projection_matches_single_projection_template() { + // A template registered as single-projection should structurally match an + // incoming query that lists the group cols in SELECT alongside the aggregate. + let template = parse_sql_query( + "SELECT SUM(value) FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1, L2, L3, L4", + ) + .expect("single-projection template should parse"); + let incoming = parse_sql_query( + "SELECT L1, L2, L3, L4, SUM(value) FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, '2025-10-01 00:00:10') AND '2025-10-01 00:00:10' \ + GROUP BY L1, L2, L3, L4", + ) + .expect("multi-projection incoming should parse"); + assert!(incoming.matches_sql_pattern(&template)); + } + + #[test] + fn test_multi_projection_rejects_two_aggregates() { + // Two aggregate functions in the projection list — the parser only tracks one + // statistic so this must be rejected to avoid silently dropping one. + assert!(parse_sql_query( + "SELECT SUM(value), AVG(value), L1 FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1", + ) + .is_none()); + } + + #[test] + fn test_multi_projection_rejects_arbitrary_expr() { + // Non-identifier, non-function projection items (computed expressions, literals, …) + // are not supported by the pattern model and must be rejected. + assert!(parse_sql_query( + "SELECT (L1 + 1), SUM(value) FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1", + ) + .is_none()); + } } diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs index 4b653ec2..cfadc79a 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs @@ -211,88 +211,100 @@ impl SQLPatternParser { } fn get_aggregation(&self, select: &Select) -> Option { - if select.projection.len() != 1 { - return None; + // Find the (single) aggregate function in the projection list. Other + // projection items must be plain column references — these are expected to + // be group-by keys (e.g. `SELECT g1, g2, SUM(v) FROM t GROUP BY g1, g2`). + // Anything else (multiple aggregates, computed expressions, literals, *) + // is rejected since the structural pattern model only tracks one statistic. + let mut agg_func: Option<&Function> = None; + for item in &select.projection { + let expr = match item { + SelectItem::UnnamedExpr(expr) => expr, + SelectItem::ExprWithAlias { expr, .. } => expr, + _ => return None, + }; + match expr { + Expr::Function(f) => { + if agg_func.is_some() { + return None; + } + agg_func = Some(f); + } + Expr::Identifier(_) | Expr::CompoundIdentifier(_) => {} + _ => return None, + } } + let func = agg_func?; - match &select.projection[0] { - SelectItem::UnnamedExpr(Expr::Function(func)) - | SelectItem::ExprWithAlias { - expr: Expr::Function(func), - .. - } => { - let name = func.name.to_string().to_uppercase(); - - let args = self.get_quantile_args(func); - - // Get the column being aggregated - let col = match &func.args { - FunctionArguments::None => return None, - FunctionArguments::Subquery(_) => return None, - FunctionArguments::List(func_args) => { - if name == "QUANTILE" { - if let FunctionArguments::List(params) = &func.parameters { - if !params.args.is_empty() { - // ClickHouse parametric syntax: quantile(0.95)(column) - // Column is the sole argument in func.args. - match func_args.args.first() { - Some(FunctionArg::Unnamed(FunctionArgExpr::Expr( - Expr::Identifier(ident), - ))) => ident.value.clone(), - _ => return None, - } - } else { - return None; - } - } else { - // ASAP syntax: QUANTILE(0.95, value) - column is second argument - if func_args.args.len() < 2 { - return None; - } - match &func_args.args[1] { - FunctionArg::Unnamed(FunctionArgExpr::Expr( - Expr::Identifier(ident), - )) => ident.value.clone(), - _ => return None, - } - } - } else if name == "PERCENTILE" { - // PERCENTILE(value, 95) - column is first argument - if func_args.args.is_empty() { - return None; - } - match &func_args.args[0] { - FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier( - ident, + let name = func.name.to_string().to_uppercase(); + + let args = self.get_quantile_args(func); + + // Get the column being aggregated + let col = match &func.args { + FunctionArguments::None => return None, + FunctionArguments::Subquery(_) => return None, + FunctionArguments::List(func_args) => { + if name == "QUANTILE" { + if let FunctionArguments::List(params) = &func.parameters { + if !params.args.is_empty() { + // ClickHouse parametric syntax: quantile(0.95)(column) + // Column is the sole argument in func.args. + match func_args.args.first() { + Some(FunctionArg::Unnamed(FunctionArgExpr::Expr( + Expr::Identifier(ident), ))) => ident.value.clone(), _ => return None, } } else { - // For other aggregations - column is first argument - if func_args.args.is_empty() { - return None; - } - match &func_args.args[0] { - FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier( - ident, - ))) => ident.value.clone(), - _ => return None, - } + return None; + } + } else { + // ASAP syntax: QUANTILE(0.95, value) - column is second argument + if func_args.args.len() < 2 { + return None; + } + match &func_args.args[1] { + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier( + ident, + ))) => ident.value.clone(), + _ => return None, } } - }; - - // Always store PERCENTILE as QUANTILE internally - let normalized_name = if name == "PERCENTILE" { - "QUANTILE".to_string() + } else if name == "PERCENTILE" { + // PERCENTILE(value, 95) - column is first argument + if func_args.args.is_empty() { + return None; + } + match &func_args.args[0] { + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier(ident))) => { + ident.value.clone() + } + _ => return None, + } } else { - name - }; - - Some(AggregationInfo::new(normalized_name, col, args)) + // For other aggregations - column is first argument + if func_args.args.is_empty() { + return None; + } + match &func_args.args[0] { + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier(ident))) => { + ident.value.clone() + } + _ => return None, + } + } } - _ => None, - } + }; + + // Always store PERCENTILE as QUANTILE internally + let normalized_name = if name == "PERCENTILE" { + "QUANTILE".to_string() + } else { + name + }; + + Some(AggregationInfo::new(normalized_name, col, args)) } fn get_metric(&self, select: &Select) -> Option<(String, bool)> { From 59f77146c13377e443ed993201d32b5c5ba1a5f1 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Fri, 22 May 2026 22:28:06 +0000 Subject: [PATCH 2/2] fix(sql-utilities): reject SELECT identifiers absent from GROUP BY --- .../src/ast_matching/sqlparser_test.rs | 28 +++++++++++++++++++ .../src/ast_matching/sqlpattern_parser.rs | 28 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs index cc8cdef8..5843de6c 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs @@ -891,4 +891,32 @@ mod tests { ) .is_none()); } + + #[test] + fn test_multi_projection_rejects_select_col_not_in_groupby() { + // L2 is in SELECT but not in GROUP BY. Standard SQL rejects this; we must too, + // otherwise the column would be silently dropped from the output. + assert!(parse_sql_query( + "SELECT L2, SUM(value) FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1", + ) + .is_none()); + } + + #[test] + fn test_multi_projection_accepts_select_subset_of_groupby() { + // SELECT lists a subset of group-by keys (L1) while the GROUP BY uses two + // (L1, L2). Allowed: every SELECT identifier is in GROUP BY; the remaining + // group-by key is just absent from the projection. + let query = parse_sql_query( + "SELECT L1, SUM(value) FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \ + GROUP BY L1, L2", + ) + .expect("SELECT subset of GROUP BY should parse"); + assert!(query.labels.contains("L1")); + assert!(query.labels.contains("L2")); + assert_eq!(query.aggregation_info.get_name(), "SUM"); + } } diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs index cfadc79a..3453b700 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs @@ -92,6 +92,10 @@ impl SQLPatternParser { let group_bys = self.get_groupbys(select)?; + if !self.select_identifiers_subset_of(select, &group_bys) { + return None; + } + if !has_subquery { let time_info = self.get_time_info(select, &metric)?; @@ -126,6 +130,9 @@ impl SQLPatternParser { SetExpr::Select(inner_select) => { let inner_aggregation = self.get_aggregation(inner_select)?; let inner_group_bys = self.get_groupbys(inner_select)?; + if !self.select_identifiers_subset_of(inner_select, &inner_group_bys) { + return None; + } let time_info = self.get_time_info(inner_select, &metric)?; Some(Box::new(SQLQueryData { @@ -210,6 +217,27 @@ impl SQLPatternParser { } } + /// Returns true iff every non-aggregate identifier in `select.projection` is + /// also present in `group_bys`. Used to reject queries like + /// `SELECT srcip, SUM(v) FROM t GROUP BY proto`, where standard SQL would + /// require `srcip` to appear in the GROUP BY clause; without this check the + /// pattern parser would silently drop `srcip` from the output. + fn select_identifiers_subset_of(&self, select: &Select, group_bys: &HashSet) -> bool { + for item in &select.projection { + let expr = match item { + SelectItem::UnnamedExpr(expr) => expr, + SelectItem::ExprWithAlias { expr, .. } => expr, + _ => continue, + }; + if let Expr::Identifier(ident) = expr { + if !group_bys.contains(&ident.value) { + return false; + } + } + } + true + } + fn get_aggregation(&self, select: &Select) -> Option { // Find the (single) aggregate function in the projection list. Other // projection items must be plain column references — these are expected to