@@ -595,6 +595,73 @@ mod tests {
595595 ) ;
596596 }
597597
/// Regression test for three matcher-side gaps that showed up together when
/// the simple_engine SQL path ran an HLL `COUNT(DISTINCT)` query end-to-end:
///
/// 1. The parser normalises `COUNT(DISTINCT col)` to the aggregation name
///    `"CARDINALITY"`, but `SQLPatternMatcher::is_valid_aggregation` did not
///    list `CARDINALITY` in its `legal_aggregations` set, so validation
///    failed with `IllegalAggregationFn` before pattern matching ran.
///
/// 2. Once (1) was fixed, `flatten_query_info` checked the aggregation's
///    value column with `schema.is_valid_value_column`, which only knows
///    table value columns. `COUNT(DISTINCT col)` legitimately targets
///    metadata/label columns (e.g. `COUNT(DISTINCT dstip)`), so it failed
///    with `InvalidValueCol`. The fix accepts metadata columns *only* for
///    CARDINALITY.
///
/// 3. With both fixed, the query classifies as `SpatioTemporal` because
///    `GROUP BY` covers only a subset of the metadata columns (labels ⊊
///    metadata_columns).
///
/// Observed log line that motivated this test:
///   error: Some(IllegalAggregationFn),
///   msg: Some("attempt to use illegal aggregation function CARDINALITY")
#[test]
fn test_count_distinct_passes_aggregation_allowlist() {
    // Mirrors the motivating `COUNT(DISTINCT dstip) ... GROUP BY srcip` query:
    // a single grouping label plus a distinct-target that is a metadata
    // column. The wider `GROUP BY L1, L2, L3` shape is exercised by
    // `test_count_distinct_with_full_remaining_labels_is_spatiotemporal`, so
    // this test deliberately uses the narrow shape to avoid duplicating it.
    check_query(
        "SELECT COUNT(DISTINCT L4) FROM cpu_usage \
         WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
         GROUP BY L1",
        vec![QueryType::SpatioTemporal],
        None,
    );
}
633+
/// Companion case: `GROUP BY` names every metadata column other than the
/// distinct-target. The distinct-target acts as the value column rather than
/// a grouping label, so the label set is still a strict subset of
/// metadata_columns and the query must classify as `SpatioTemporal`. Guards
/// against classifier regressions that treat `L4` as both label and value.
#[test]
fn test_count_distinct_with_full_remaining_labels_is_spatiotemporal() {
    let sql = "SELECT COUNT(DISTINCT L4) FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
               GROUP BY L1, L2, L3";
    let expected_types = vec![QueryType::SpatioTemporal];

    check_query(sql, expected_types, None);
}
649+
/// Negative case: a `COUNT(DISTINCT ...)` target that is neither a
/// value_column nor a metadata_column must still fail with
/// `InvalidValueCol`. The CARDINALITY relaxation widens the set of accepted
/// columns (to include metadata columns); it does not switch the schema
/// check off.
#[test]
fn test_count_distinct_unknown_column_still_rejected() {
    let sql = "SELECT COUNT(DISTINCT bogus_column) FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
               GROUP BY L1, L2, L3";
    let expected_error = Some(QueryError::InvalidValueCol);

    // No query types are expected when validation rejects the query.
    check_query(sql, vec![], expected_error);
}
664+
598665 #[ test]
599666 fn test_error_spatial_scrape_duration_too_small ( ) {
600667 check_query (
@@ -1034,4 +1101,143 @@ mod tests {
10341101 . unwrap ( ) ;
10351102 assert ! ( incoming. matches_sql_pattern( & template) ) ;
10361103 }
1104+
// ── COUNT(DISTINCT col) support ──────────────────────────────────────────
//
// `COUNT(DISTINCT col)` must be normalised to a cardinality aggregation
// (`AggregationInfo.name == "CARDINALITY"`) so the engine routes it to a
// distinct-tracking sketch (SetAggregator / HLL) instead of a plain Count
// sketch. Before this normalisation was added, the parser silently dropped
// `DISTINCT`, a parser-level bug that would have dispatched distinct counts
// as plain totals. The tests below pin the corrected behaviour.
1112+
#[test]
fn test_count_distinct_single_column_maps_to_cardinality() {
    // Structural signature of the user's COUNT(DISTINCT) query: one grouping
    // label and one distinct-target column.
    let sql = "SELECT L1, COUNT(DISTINCT L2) FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
               GROUP BY L1";
    let parsed = parse_sql_query(sql).expect("COUNT(DISTINCT col) should parse");

    // DISTINCT is folded into the aggregation name, not carried as an argument.
    let aggregation = &parsed.aggregation_info;
    assert_eq!(aggregation.get_name(), "CARDINALITY");
    assert_eq!(aggregation.get_value_column_name(), "L2");
    assert!(aggregation.get_args().is_empty());

    assert!(parsed.labels.contains("L1"));
}
1127+
#[test]
fn test_count_distinct_full_user_query_with_order_by_limit() {
    // Same shape as the user's HLL netflow query, ported to the test schema:
    // aliased COUNT(DISTINCT), a one-second window, ORDER BY alias, LIMIT.
    let sql = "SELECT L1, COUNT(DISTINCT L2) AS unique_peers FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) \
               GROUP BY L1 \
               ORDER BY unique_peers DESC LIMIT 20";
    let parsed =
        parse_sql_query(sql).expect("COUNT(DISTINCT col) + ORDER BY + LIMIT should parse");

    // Aggregation: normalised name, target column, and the user-supplied alias.
    let aggregation = &parsed.aggregation_info;
    assert_eq!(aggregation.get_name(), "CARDINALITY");
    assert_eq!(aggregation.get_value_column_name(), "L2");
    assert_eq!(parsed.aggregation_alias.as_deref(), Some("unique_peers"));

    // Ordering: exactly one key, referring to the alias, descending.
    assert_eq!(parsed.order_by.len(), 1);
    let sort_key = &parsed.order_by[0];
    assert_eq!(sort_key.column, "unique_peers");
    assert!(!sort_key.ascending);

    assert_eq!(parsed.limit, Some(20));
}
1146+
#[test]
fn test_count_distinct_matches_count_distinct_template() {
    // Pattern matching: an incoming COUNT(DISTINCT col) query written with
    // absolute timestamps must match a COUNT(DISTINCT col) template written
    // relative to NOW().
    let relative_sql = "SELECT COUNT(DISTINCT L2) FROM cpu_usage \
                        WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
                        GROUP BY L1";
    let absolute_sql = "SELECT COUNT(DISTINCT L2) FROM cpu_usage \
                        WHERE time BETWEEN DATEADD(s, -10, '2025-10-01 00:00:10') AND '2025-10-01 00:00:10' \
                        GROUP BY L1";

    let template_query = parse_sql_query(relative_sql).unwrap();
    let incoming_query = parse_sql_query(absolute_sql).unwrap();

    assert!(incoming_query.matches_sql_pattern(&template_query));
}
1165+
#[test]
fn test_count_distinct_does_not_match_plain_count_template() {
    // CARDINALITY and COUNT are different aggregations: an incoming COUNT(col)
    // query must not match a COUNT(DISTINCT col) template, and an incoming
    // COUNT(DISTINCT col) query must not match a COUNT(col) template.
    let distinct_sql = "SELECT COUNT(DISTINCT L2) FROM cpu_usage \
                        WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
                        GROUP BY L1";
    let plain_sql = "SELECT COUNT(L2) FROM cpu_usage \
                     WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
                     GROUP BY L1";

    let distinct_query = parse_sql_query(distinct_sql).unwrap();
    let plain_query = parse_sql_query(plain_sql).unwrap();

    // Check both directions so neither query can stand in for the other.
    assert!(!plain_query.matches_sql_pattern(&distinct_query));
    assert!(!distinct_query.matches_sql_pattern(&plain_query));
}
1185+
#[test]
fn test_count_all_treated_as_plain_count() {
    // `ALL` is the SQL default duplicate treatment; spelling it out must NOT
    // switch the aggregation to CARDINALITY. Only `DISTINCT` does that.
    let sql = "SELECT COUNT(ALL L2) FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
               GROUP BY L1";
    let parsed = parse_sql_query(sql).expect("COUNT(ALL col) should parse as plain COUNT");

    assert_eq!(parsed.aggregation_info.get_name(), "COUNT");
}
1198+
#[test]
fn test_count_without_distinct_remains_count() {
    // Regression guard: the DISTINCT-aware path must leave a bare `COUNT(col)`
    // (no duplicate_treatment modifier at all) as a plain COUNT.
    let sql = "SELECT COUNT(L2) FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
               GROUP BY L1";
    let parsed = parse_sql_query(sql).expect("COUNT(col) should parse");

    assert_eq!(parsed.aggregation_info.get_name(), "COUNT");
}
1211+
#[test]
fn test_count_distinct_multiple_columns_rejected() {
    // `COUNT(DISTINCT a, b)` is a compound-key cardinality, which the
    // structural model cannot express with its single value_column. The parser
    // must reject it outright instead of quietly keeping only the first
    // argument.
    let sql = "SELECT COUNT(DISTINCT L1, L2) FROM cpu_usage \
               WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
               GROUP BY L3";
    let parsed = parse_sql_query(sql);

    assert!(parsed.is_none());
}
1224+
#[test]
fn test_distinct_on_non_count_aggregate_rejected() {
    // DISTINCT on an aggregate other than COUNT (e.g. `SUM(DISTINCT v)`,
    // `AVG(DISTINCT v)`) has no matching precompute sketch type. The parser
    // must reject such queries rather than drop the modifier and dispatch to
    // the plain aggregate.
    let sum_distinct = parse_sql_query(
        "SELECT SUM(DISTINCT value) FROM cpu_usage \
         WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
         GROUP BY L1",
    );
    assert!(sum_distinct.is_none());

    let avg_distinct = parse_sql_query(
        "SELECT AVG(DISTINCT value) FROM cpu_usage \
         WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
         GROUP BY L1",
    );
    assert!(avg_distinct.is_none());
}
10371243}
0 commit comments