Skip to content

Commit 8612022

Browse files
Refined comments
1 parent da67cb2 commit 8612022

6 files changed

Lines changed: 52 additions & 132 deletions

File tree

asap-common/dependencies/rs/asap_types/src/capability_matching.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,6 @@ pub fn compatible_agg_types(stat: Statistic) -> &'static [AggregationType] {
3333
Statistic::Cardinality => &[
3434
AggregationType::SetAggregator,
3535
AggregationType::DeltaSetAggregator,
36-
// HLL is the single-population probabilistic alternative used by
37-
// `COUNT(DISTINCT col) GROUP BY …` queries. It backs the per-bucket
38-
// sketch directly (no paired key aggregation required) — see
39-
// `is_multi_population_value_type`, which excludes HLL.
4036
AggregationType::HLL,
4137
],
4238
Statistic::Topk => &[AggregationType::CountMinSketchWithHeap],

asap-common/dependencies/rs/promql_utilities/src/query_logics/enums.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,6 @@ pub enum AggregationOperator {
185185
Min,
186186
Max,
187187
Topk,
188-
/// Distinct-value count. SQL-side normalisation maps `COUNT(DISTINCT col)` to
189-
/// the aggregation name "CARDINALITY", which lands here.
190188
Cardinality,
191189
}
192190

@@ -453,8 +451,7 @@ mod tests {
453451
fn test_aggregation_operator_cardinality_round_trip() {
454452
// The SQL parser normalises `COUNT(DISTINCT col)` to the aggregation name
455453
// "CARDINALITY"; `parse_single_statistic` then routes it through
456-
// `AggregationOperator::FromStr`. Without a Cardinality variant the lookup
457-
// returns Err and the query is rejected as "Unsupported statistic name".
454+
// `AggregationOperator::FromStr`.
458455
let op: AggregationOperator = "cardinality".parse().expect("cardinality should parse");
459456
assert_eq!(op, AggregationOperator::Cardinality);
460457
assert_eq!(op.to_statistics(), vec![Statistic::Cardinality]);

asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs

Lines changed: 46 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -595,73 +595,6 @@ mod tests {
595595
);
596596
}
597597

598-
/// Regression for three matcher-side gaps that surface together when the
599-
/// simple_engine SQL path runs an HLL `COUNT(DISTINCT)` query end-to-end:
600-
///
601-
/// 1. The parser correctly normalises `COUNT(DISTINCT col)` to the
602-
/// aggregation name `"CARDINALITY"`, but
603-
/// `SQLPatternMatcher::is_valid_aggregation` never gained `CARDINALITY`
604-
/// in its `legal_aggregations` set, so the validator rejected the query
605-
/// with `IllegalAggregationFn` before pattern matching ran.
606-
///
607-
/// 2. After fixing (1), `flatten_query_info` validates the aggregation's
608-
/// "value column" against `schema.is_valid_value_column`, which only
609-
/// knows table value columns. `COUNT(DISTINCT col)` legitimately targets
610-
/// metadata/label columns (e.g. `COUNT(DISTINCT dstip)`), so the
611-
/// validator rejected it with `InvalidValueCol`. The fix accepts
612-
/// metadata columns *only* for CARDINALITY.
613-
///
614-
/// 3. With both fixed, the query classifies as `SpatioTemporal` because
615-
/// `GROUP BY` only covers a subset of metadata columns — exactly the
616-
/// shape of the user's real `COUNT(DISTINCT dstip) GROUP BY srcip`
617-
/// query, which selects on `srcip` and aggregates over `dstip` (so
618-
/// labels ⊊ metadata_columns).
619-
///
620-
/// Observed log line that motivated this test:
621-
/// error: Some(IllegalAggregationFn),
622-
/// msg: Some("attempt to use illegal aggregation function CARDINALITY")
623-
#[test]
624-
fn test_count_distinct_passes_aggregation_allowlist() {
625-
check_query(
626-
"SELECT COUNT(DISTINCT L4) FROM cpu_usage \
627-
WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
628-
GROUP BY L1, L2, L3",
629-
vec![QueryType::SpatioTemporal],
630-
None,
631-
);
632-
}
633-
634-
/// Companion: when `GROUP BY` covers all metadata columns *except* the
635-
/// distinct-target itself, the query is still SpatioTemporal — the
636-
/// distinct-target is the value column, not a grouping label, so labels
637-
/// always form a strict subset of metadata_columns. Guards against future
638-
/// "treat L4 as both label and value" regressions in the classifier.
639-
#[test]
640-
fn test_count_distinct_with_full_remaining_labels_is_spatiotemporal() {
641-
check_query(
642-
"SELECT COUNT(DISTINCT L4) FROM cpu_usage \
643-
WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
644-
GROUP BY L1, L2, L3",
645-
vec![QueryType::SpatioTemporal],
646-
None,
647-
);
648-
}
649-
650-
/// Negative case: `COUNT(DISTINCT not_in_schema)` against a column that's
651-
/// neither a value_column nor a metadata_column must still be rejected as
652-
/// `InvalidValueCol`. The CARDINALITY relaxation widens what's *allowed*
653-
/// (metadata columns) but doesn't disable the schema check entirely.
654-
#[test]
655-
fn test_count_distinct_unknown_column_still_rejected() {
656-
check_query(
657-
"SELECT COUNT(DISTINCT bogus_column) FROM cpu_usage \
658-
WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
659-
GROUP BY L1, L2, L3",
660-
vec![],
661-
Some(QueryError::InvalidValueCol),
662-
);
663-
}
664-
665598
#[test]
666599
fn test_error_spatial_scrape_duration_too_small() {
667600
check_query(
@@ -1107,8 +1040,7 @@ mod tests {
11071040
// `COUNT(DISTINCT col)` must be normalised to a cardinality aggregation
11081041
// (`AggregationInfo.name == "CARDINALITY"`) so the engine routes it to a
11091042
// distinct-tracking sketch (SetAggregator / HLL) instead of a plain Count
1110-
// sketch. The parser today drops `DISTINCT` silently — a parser-level bug
1111-
// that would dispatch streaming counts as totals.
1043+
// sketch.
11121044

11131045
#[test]
11141046
fn test_count_distinct_single_column_maps_to_cardinality() {
@@ -1240,4 +1172,49 @@ mod tests {
12401172
)
12411173
.is_none());
12421174
}
1175+
1176+
/// Matcher must accept parser-normalised `CARDINALITY` (not `IllegalAggregationFn`),
1177+
/// allow distinct targets in metadata_columns (e.g. `dstip`), and classify
1178+
/// `COUNT(DISTINCT col) GROUP BY <label subset>` as `SpatioTemporal`.
1179+
#[test]
1180+
fn test_count_distinct_passes_aggregation_allowlist() {
1181+
check_query(
1182+
"SELECT COUNT(DISTINCT L4) FROM cpu_usage \
1183+
WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
1184+
GROUP BY L1, L2, L3",
1185+
vec![QueryType::SpatioTemporal],
1186+
None,
1187+
);
1188+
}
1189+
1190+
/// Companion: when `GROUP BY` covers all metadata columns *except* the
1191+
/// distinct-target itself, the query is still SpatioTemporal — the
1192+
/// distinct-target is the value column, not a grouping label, so labels
1193+
/// always form a strict subset of metadata_columns. Guards against future
1194+
/// "treat L4 as both label and value" regressions in the classifier.
1195+
#[test]
1196+
fn test_count_distinct_with_full_remaining_labels_is_spatiotemporal() {
1197+
check_query(
1198+
"SELECT COUNT(DISTINCT L4) FROM cpu_usage \
1199+
WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
1200+
GROUP BY L1, L2, L3",
1201+
vec![QueryType::SpatioTemporal],
1202+
None,
1203+
);
1204+
}
1205+
1206+
/// Negative case: `COUNT(DISTINCT not_in_schema)` against a column that's
1207+
/// neither a value_column nor a metadata_column must still be rejected as
1208+
/// `InvalidValueCol`. The CARDINALITY relaxation widens what's *allowed*
1209+
/// (metadata columns) but doesn't disable the schema check entirely.
1210+
#[test]
1211+
fn test_count_distinct_unknown_column_still_rejected() {
1212+
check_query(
1213+
"SELECT COUNT(DISTINCT bogus_column) FROM cpu_usage \
1214+
WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() \
1215+
GROUP BY L1, L2, L3",
1216+
vec![],
1217+
Some(QueryError::InvalidValueCol),
1218+
);
1219+
}
12431220
}

asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_matcher.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,7 @@ impl SQLPatternMatcher {
102102
legal_aggregations.insert("MIN");
103103
legal_aggregations.insert("MAX");
104104
legal_aggregations.insert("QUANTILE");
105-
// `COUNT(DISTINCT col)` is normalised by the parser to the aggregation
106-
// name "CARDINALITY" (see `SQLPatternParser::get_aggregation`). Without
107-
// this entry the simple_engine SQL handler rejects every COUNT(DISTINCT)
108-
// query with `IllegalAggregationFn` before pattern matching runs, which
109-
// in turn blocks routing to the precompute engine's HLL accumulator.
105+
// COUNT(DISTINCT col) is normalised by the parser to the aggregationname "CARDINALITY"
110106
legal_aggregations.insert("CARDINALITY");
111107

112108
Self {

asap-query-engine/src/precompute_engine/accumulator_factory.rs

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -896,13 +896,8 @@ mod tests {
896896
}
897897
}
898898

899-
// ── HLL updater ──────────────────────────────────────────────────────
900-
//
901-
// `COUNT(DISTINCT col)` queries flow through `AggregationType::HLL`. The
902-
// factory must produce an `HllAccumulatorUpdater` (not the silent
903-
// `SumAccumulatorUpdater` fallback that the old default arm gave) so the
904-
// streaming layer actually hashes incoming samples into an HLL register
905-
// array rather than summing them.
899+
// HLL: `AggregationType::HLL` must build `HllAccumulatorUpdater` (hashes samples
900+
// into a sketch), not fall through to the default `SumAccumulatorUpdater`.
906901

907902
#[test]
908903
fn test_hll_updater_via_factory_routes_to_hll_accumulator() {

asap-query-engine/src/precompute_operators/hll_accumulator.rs

Lines changed: 2 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,3 @@
1-
//! HyperLogLog accumulator for `COUNT(DISTINCT col)` queries.
2-
//!
3-
//! Wraps `asap_sketchlib::HllSketch` (precision-14 `HllVariant::Regular` by default)
4-
//! and exposes it through the precompute-engine accumulator traits so the streaming
5-
//! worker can feed values into it and the query path can dispatch `Cardinality`
6-
//! against the resulting sketch.
7-
//!
8-
//! Mirrors the `DatasketchesKLLAccumulator` layout: a single subpopulation per
9-
//! grouping key, msgpack-encoded byte serialization (round-trippable, unlike the
10-
//! lossy `datafusion_summary_library::physical::hll::HllSketch::to_bytes`).
11-
//!
12-
//! End-to-end pipeline for `COUNT(DISTINCT col)`:
13-
//! 1. SQL parser normalises the call to `name="CARDINALITY", value_column=col`.
14-
//! 2. `AggregationOperator::Cardinality.to_statistics()` → `[Statistic::Cardinality]`.
15-
//! 3. Capability matching pairs the query with an `AggregationType::HLL` config.
16-
//! 4. The precompute worker feeds `col`'s f64 samples through
17-
//! `HllAccumulatorUpdater::update_single`, which calls `HllAccumulator::update`.
18-
//! 5. Query path: `plan_builder` maps `Statistic::Cardinality → InferOperation::CountDistinct`;
19-
//! `SummaryInferExec` deserializes the stored sketch and calls
20-
//! `SingleSubpopulationAggregate::query(Cardinality)`, which returns
21-
//! `HllSketch::estimate()`.
22-
231
use std::collections::HashMap;
242

253
use asap_sketchlib::sketches::hll::{HllSketch, HllVariant};
@@ -33,15 +11,12 @@ use crate::data_model::{
3311
SingleSubpopulationAggregate,
3412
};
3513

36-
/// Default precision when none is supplied via streaming-config parameters.
37-
/// Matches `datafusion_summary_library::physical::hll::HllSketch` (~0.8% std error,
38-
/// ~16 KiB per sketch).
14+
/// Default HLL precision when streaming config omits `parameters.precision`.
3915
pub const DEFAULT_HLL_PRECISION: u32 = 14;
4016

4117
/// HLL sketch accumulator — wraps `asap_sketchlib::HllSketch`.
4218
/// Core insert/merge/serde logic lives in `asap_sketchlib`; this file retains
43-
/// the QE-specific trait impls (`AggregateCore`, `SingleSubpopulationAggregate`,
44-
/// `SerializableToSink`, `MergeableAccumulator`).
19+
/// QE-specific trait impls.
4520
#[derive(Debug, Clone)]
4621
pub struct HllAccumulator {
4722
pub inner: HllSketch,
@@ -58,19 +33,10 @@ impl HllAccumulator {
5833
Self::new(DEFAULT_HLL_PRECISION)
5934
}
6035

61-
/// Feed a value into the sketch. The streaming layer surfaces all column
62-
/// values as `f64`; we hash their little-endian bytes via `HllSketch::update`,
63-
/// which goes through the canonical sketchlib hash (`hash64_seeded` with
64-
/// `CANONICAL_HASH_SEED`). That makes the on-disk sketch byte-identical to
65-
/// what `sketchlib-go` would produce for the same stream — cross-language
66-
/// parity is locked in by upstream's golden-byte test.
6736
pub fn update(&mut self, value: f64) {
6837
self.inner.update(&value.to_le_bytes());
6938
}
7039

71-
/// Current cardinality estimate. Returned as `f64` so callers don't lose the
72-
/// fractional small-range correction; the engine truncates to an integer in
73-
/// the final result column when needed.
7440
pub fn estimate(&self) -> f64 {
7541
self.inner.estimate()
7642
}
@@ -96,9 +62,6 @@ impl Default for HllAccumulator {
9662

9763
impl SerializableToSink for HllAccumulator {
9864
fn serialize_to_json(&self) -> Value {
99-
// Mirrors KLL's JSON shape: opaque base64 of the canonical msgpack body so
100-
// downstream consumers can round-trip through any serde-capable channel
101-
// without losing the register array.
10265
let bytes = self.inner.serialize_msgpack().unwrap_or_default();
10366
let b64 = general_purpose::STANDARD.encode(&bytes);
10467
serde_json::json!({
@@ -152,8 +115,6 @@ impl AggregateCore for HllAccumulator {
152115
}
153116

154117
fn get_keys(&self) -> Option<Vec<crate::KeyByLabelValues>> {
155-
// HLL is a probabilistic counter, not a set: it estimates |S| without
156-
// retaining the individual members of S. There are no keys to enumerate.
157118
None
158119
}
159120

@@ -163,8 +124,6 @@ impl AggregateCore for HllAccumulator {
163124
_key: &Option<crate::KeyByLabelValues>,
164125
query_kwargs: &HashMap<String, String>,
165126
) -> Result<f64, Box<dyn std::error::Error + Send + Sync>> {
166-
// HLL is a single-population accumulator (one sketch per grouping key
167-
// partition), so `key` is unused — same shape as KllAccumulator.
168127
SingleSubpopulationAggregate::query(self, statistic, Some(query_kwargs))
169128
}
170129
}

0 commit comments

Comments
 (0)