Skip to content

Commit 5d50a0e

Browse files
committed
x
1 parent 7491181 commit 5d50a0e

10 files changed

Lines changed: 1782 additions & 255 deletions

File tree

src/common/statistics/src/datum.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,6 @@ impl Datum {
3434
matches!(self, Datum::Bytes(_))
3535
}
3636

37-
pub fn cast_float(self) -> Self {
38-
match self {
39-
Datum::Int(v) => Datum::Float(F64::from(v as f64)),
40-
Datum::UInt(v) => Datum::Float(F64::from(v as f64)),
41-
_ => self,
42-
}
43-
}
44-
4537
pub fn as_double(&self) -> Result<f64, ErrorCode> {
4638
match self {
4739
Datum::Bool(v) => Ok(*v as u8 as f64),

src/common/statistics/src/histogram.rs

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
use std::fmt;
1616

17+
use databend_common_exception::ErrorCode;
18+
use databend_common_exception::Result as ExceptionResult;
19+
1720
use crate::Datum;
1821
use crate::F64;
1922
use crate::JoinEstimation;
@@ -22,6 +25,25 @@ use crate::TypedHistogramBucket;
2225

2326
pub const DEFAULT_HISTOGRAM_BUCKETS: usize = 100;
2427

28+
/// A column histogram used by optimizer statistics.
29+
///
30+
/// Histograms currently have two sources with different reliability:
31+
/// - `accuracy == true`: buckets come from `ANALYZE TABLE`. For each supported
32+
/// non-null column, ANALYZE runs a query equivalent to sorting rows by the
33+
/// column, assigning `NTILE(DEFAULT_HISTOGRAM_BUCKETS)`, then grouping by tile
34+
/// and collecting `MIN(col)`, `MAX(col)`, `COUNT()`, and
35+
/// `COUNT(DISTINCT col)`. Each bucket is therefore the closed value envelope
36+
/// observed in one row-order tile. The bucket list is not a value-domain
37+
/// partition: adjacent buckets may share boundaries or overlap when duplicate
38+
/// values cross tile boundaries.
39+
/// - `accuracy == false`: buckets are synthesized from column NDV plus
40+
/// min/max bounds by [`crate::HistogramBuilder::from_ndv`]. These buckets
41+
/// assume a uniform distribution over the recorded bounds, and numeric
42+
/// histograms keep `avg_spacing` so consumers can detect distorted ranges.
43+
///
44+
/// Consumers should preserve this distinction when updating or interpreting
45+
/// bucket counts. The type variants preserve the bucket value type for
46+
/// serialization, function selectivity, and type-specific join estimation.
2547
#[derive(Debug, Clone)]
2648
pub enum Histogram {
2749
Int(TypedHistogram<i64>),
@@ -160,13 +182,16 @@ impl Histogram {
160182
}
161183
}
162184

163-
pub fn estimate_join(&self, other: &Histogram) -> JoinEstimation {
185+
/// Estimate a join only when both histograms use the same typed bucket representation.
///
/// # Errors
///
/// Returns `ErrorCode::Internal` for any pairing other than `Int`/`Int`,
/// `UInt`/`UInt`, `Float`/`Float`, or `Bytes`/`Bytes`. The previous version
/// returned `JoinEstimation::zero()` for those pairings instead of an error.
186+
pub fn estimate_join(&self, other: &Histogram) -> ExceptionResult<JoinEstimation> {
164187
match (self, other) {
165-
(Self::Int(left), Self::Int(right)) => left.estimate_join(right),
166-
(Self::UInt(left), Self::UInt(right)) => left.estimate_join(right),
167-
(Self::Float(left), Self::Float(right)) => left.estimate_join(right),
168-
(Self::Bytes(left), Self::Bytes(right)) => left.estimate_join(right),
169-
_ => JoinEstimation::zero(),
188+
(Self::Int(left), Self::Int(right)) => Ok(left.estimate_join(right)),
189+
(Self::UInt(left), Self::UInt(right)) => Ok(left.estimate_join(right)),
190+
(Self::Float(left), Self::Float(right)) => Ok(left.estimate_join(right)),
191+
(Self::Bytes(left), Self::Bytes(right)) => Ok(left.estimate_join(right)),
192+
_ => Err(ErrorCode::Internal(
193+
"cannot estimate join for histograms with different bucket types",
194+
)),
170195
}
171196
}
172197

@@ -290,15 +315,7 @@ impl HistogramBucket {
290315
(Datum::Bytes(lower_bound), Datum::Bytes(upper_bound)) => Ok(Self::Bytes(
291316
TypedHistogramBucket::new(lower_bound, upper_bound, num_values, num_distinct),
292317
)),
293-
(lower_bound, upper_bound) if lower_bound.is_numeric() && upper_bound.is_numeric() => {
294-
Ok(Self::Float(TypedHistogramBucket::new(
295-
F64::from(lower_bound.as_double().unwrap_or(0.0)),
296-
F64::from(upper_bound.as_double().unwrap_or(0.0)),
297-
num_values,
298-
num_distinct,
299-
)))
300-
}
301-
_ => Err("histogram bucket bounds must have comparable types"),
318+
_ => Err("histogram bucket bounds must have the same supported type"),
302319
}
303320
}
304321

src/common/statistics/src/histogram_builder.rs

Lines changed: 123 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -66,28 +66,44 @@ impl HistogramBuilder {
6666
pub type UniformSampleSet = HistogramBounds;
6767

6868
impl HistogramBounds {
69+
/// Reports whether all four bounds (`self` lower/upper and `other`
/// lower/upper) map to the same `HistogramBoundKind`.
///
/// Returns `false` as soon as `self.lower_bound()` has no kind, i.e. when
/// `histogram_bound_kind` yields `None` for it (`Datum::Bool`). Otherwise the
/// result is `true` only if the remaining three bounds all resolve to that
/// same kind, so mixed pairs such as `UInt` vs `Int` yield `false`.
pub fn has_same_supported_type(&self, other: &HistogramBounds) -> bool {
70+
let Some(kind) = histogram_bound_kind(self.lower_bound()) else {
71+
return false;
72+
};
73+
74+
histogram_bound_kind(self.upper_bound()) == Some(kind)
75+
&& histogram_bound_kind(other.lower_bound()) == Some(kind)
76+
&& histogram_bound_kind(other.upper_bound()) == Some(kind)
77+
}
78+
6979
pub fn has_intersection(&self, other: &HistogramBounds) -> Result<bool> {
7080
match (
7181
self.lower_bound(),
7282
self.upper_bound(),
7383
other.lower_bound(),
7484
other.upper_bound(),
7585
) {
76-
(left_min, left_max, right_min, right_max)
77-
if left_min.is_numeric()
78-
&& left_max.is_numeric()
79-
&& right_min.is_numeric()
80-
&& right_max.is_numeric() =>
81-
{
82-
Ok(TypedHistogramBounds::new(
83-
F64::from(left_min.as_double()?),
84-
F64::from(left_max.as_double()?),
85-
)
86-
.has_intersection(&TypedHistogramBounds::new(
87-
F64::from(right_min.as_double()?),
88-
F64::from(right_max.as_double()?),
89-
)))
90-
}
86+
(
87+
Datum::Int(left_min),
88+
Datum::Int(left_max),
89+
Datum::Int(right_min),
90+
Datum::Int(right_max),
91+
) => Ok(TypedHistogramBounds::new(*left_min, *left_max)
92+
.has_intersection(&TypedHistogramBounds::new(*right_min, *right_max))),
93+
(
94+
Datum::UInt(left_min),
95+
Datum::UInt(left_max),
96+
Datum::UInt(right_min),
97+
Datum::UInt(right_max),
98+
) => Ok(TypedHistogramBounds::new(*left_min, *left_max)
99+
.has_intersection(&TypedHistogramBounds::new(*right_min, *right_max))),
100+
(
101+
Datum::Float(left_min),
102+
Datum::Float(left_max),
103+
Datum::Float(right_min),
104+
Datum::Float(right_max),
105+
) => Ok(TypedHistogramBounds::new(*left_min, *left_max)
106+
.has_intersection(&TypedHistogramBounds::new(*right_min, *right_max))),
91107
(
92108
Datum::Bytes(left_min),
93109
Datum::Bytes(left_max),
@@ -109,20 +125,34 @@ impl HistogramBounds {
109125
other.lower_bound(),
110126
other.upper_bound(),
111127
) {
112-
(left_min, left_max, right_min, right_max)
113-
if left_min.is_numeric()
114-
&& left_max.is_numeric()
115-
&& right_min.is_numeric()
116-
&& right_max.is_numeric() =>
117-
{
118-
let (min, max) = TypedHistogramBounds::new(
119-
F64::from(left_min.as_double()?),
120-
F64::from(left_max.as_double()?),
121-
)
122-
.intersection(&TypedHistogramBounds::new(
123-
F64::from(right_min.as_double()?),
124-
F64::from(right_max.as_double()?),
125-
));
128+
(
129+
Datum::Int(left_min),
130+
Datum::Int(left_max),
131+
Datum::Int(right_min),
132+
Datum::Int(right_max),
133+
) => {
134+
let (min, max) = TypedHistogramBounds::new(*left_min, *left_max)
135+
.intersection(&TypedHistogramBounds::new(*right_min, *right_max));
136+
Ok((min.map(Datum::Int), max.map(Datum::Int)))
137+
}
138+
(
139+
Datum::UInt(left_min),
140+
Datum::UInt(left_max),
141+
Datum::UInt(right_min),
142+
Datum::UInt(right_max),
143+
) => {
144+
let (min, max) = TypedHistogramBounds::new(*left_min, *left_max)
145+
.intersection(&TypedHistogramBounds::new(*right_min, *right_max));
146+
Ok((min.map(Datum::UInt), max.map(Datum::UInt)))
147+
}
148+
(
149+
Datum::Float(left_min),
150+
Datum::Float(left_max),
151+
Datum::Float(right_min),
152+
Datum::Float(right_max),
153+
) => {
154+
let (min, max) = TypedHistogramBounds::new(*left_min, *left_max)
155+
.intersection(&TypedHistogramBounds::new(*right_min, *right_max));
126156
Ok((min.map(Datum::Float), max.map(Datum::Float)))
127157
}
128158
(
@@ -142,9 +172,28 @@ impl HistogramBounds {
142172
}
143173
}
144174

175+
/// Tag identifying which `Datum` variant a histogram bound holds.
///
/// Produced by `histogram_bound_kind` and compared for equality in
/// `HistogramBounds::has_same_supported_type`; it carries no payload, only
/// the variant identity.
#[derive(Clone, Copy, PartialEq, Eq)]
176+
enum HistogramBoundKind {
177+
Int,
178+
UInt,
179+
Float,
180+
Bytes,
181+
}
182+
183+
/// Maps a `Datum` to its `HistogramBoundKind`.
///
/// `Int`, `UInt`, `Float`, and `Bytes` map to the kind of the same name.
/// `Datum::Bool` maps to `None`, which callers treat as "not a supported
/// histogram bound". The match has no wildcard arm, so adding a `Datum`
/// variant forces this function to be revisited.
fn histogram_bound_kind(datum: &Datum) -> Option<HistogramBoundKind> {
184+
match datum {
185+
Datum::Int(_) => Some(HistogramBoundKind::Int),
186+
Datum::UInt(_) => Some(HistogramBoundKind::UInt),
187+
Datum::Float(_) => Some(HistogramBoundKind::Float),
188+
Datum::Bytes(_) => Some(HistogramBoundKind::Bytes),
189+
Datum::Bool(_) => None,
190+
}
191+
}
192+
145193
#[cfg(test)]
146194
mod tests {
147195
use super::*;
196+
use crate::HistogramBucket;
148197
use crate::TypedHistogram;
149198
use crate::TypedHistogramBucket;
150199

@@ -165,13 +214,32 @@ mod tests {
165214
let left = UniformSampleSet::new(Datum::UInt(0), Datum::UInt(10));
166215
let right = UniformSampleSet::new(Datum::UInt(5), Datum::UInt(15));
167216

217+
assert!(left.has_same_supported_type(&right));
168218
assert!(left.has_intersection(&right).unwrap());
169219
assert_eq!(
170220
left.intersection(&right).unwrap(),
171-
(
172-
Some(Datum::Float(F64::from(5.0))),
173-
Some(Datum::Float(F64::from(10.0)))
174-
)
221+
(Some(Datum::UInt(5)), Some(Datum::UInt(10)))
222+
);
223+
}
224+
225+
// A `UInt` range and an `Int` range that overlap numerically ([0, 10] vs
// [5, 15]) must not be treated as intersecting: the type check fails,
// `has_intersection` is `Ok(false)`, and `intersection` is `Ok((None, None))`.
#[test]
226+
fn test_uniform_sample_set_rejects_mixed_numeric_intersection() {
227+
let left = UniformSampleSet::new(Datum::UInt(0), Datum::UInt(10));
228+
let right = UniformSampleSet::new(Datum::Int(5), Datum::Int(15));
229+
230+
assert!(!left.has_same_supported_type(&right));
231+
assert!(!left.has_intersection(&right).unwrap());
232+
assert_eq!(left.intersection(&right).unwrap(), (None, None));
233+
}
234+
235+
// Building a bucket from a `UInt` lower bound and an `Int` upper bound must
// fail with the exact "same supported type" message rather than producing a
// bucket.
#[test]
236+
fn test_histogram_bucket_rejects_mixed_numeric_bounds() {
237+
let err = HistogramBucket::try_from_bounds(Datum::UInt(0), Datum::Int(10), 10.0, 10.0)
238+
.unwrap_err();
239+
240+
assert_eq!(
241+
err,
242+
"histogram bucket bounds must have the same supported type"
175243
);
176244
}
177245

@@ -199,9 +267,30 @@ mod tests {
199267
avg_spacing: None,
200268
});
201269

202-
let estimation = left.estimate_join(&right);
270+
let estimation = left.estimate_join(&right).unwrap();
203271

204272
assert_eq!(estimation.cardinality.expected, 5.0);
205273
assert_eq!(estimation.ndv.expected, 5.0);
206274
}
275+
276+
// Joining a `Histogram::UInt` with a `Histogram::Int` must return an error
// carrying the "different bucket types" message, even though the bucket
// ranges ([0, 10] and [5, 15]) overlap numerically.
#[test]
277+
fn test_estimate_histogram_join_rejects_mixed_numeric_types() {
278+
let left = Histogram::UInt(TypedHistogram {
279+
accuracy: true,
280+
buckets: vec![TypedHistogramBucket::new(0, 10, 10.0, 10.0)],
281+
avg_spacing: None,
282+
});
283+
let right = Histogram::Int(TypedHistogram {
284+
accuracy: true,
285+
buckets: vec![TypedHistogramBucket::new(5, 15, 10.0, 10.0)],
286+
avg_spacing: None,
287+
});
288+
289+
let err = left.estimate_join(&right).unwrap_err();
290+
291+
assert_eq!(
292+
err.message(),
293+
"cannot estimate join for histograms with different bucket types"
294+
);
295+
}
207296
}

0 commit comments

Comments
 (0)