Skip to content

Commit 514c15c

Browse files
committed
x
1 parent 5d50a0e commit 514c15c

4 files changed

Lines changed: 533 additions & 514 deletions

File tree

src/common/statistics/src/datum.rs

Lines changed: 117 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,14 @@ use databend_common_exception::ErrorCode;
2020

2121
pub type F64 = OrderedFloat<f64>;
2222

23+
/// Tag naming the value-carrying variant of a `Datum`.
///
/// `Datum::kind` maps `Int`, `UInt`, `Float` and `Bytes` data to the matching
/// tag below and reports `None` for `Datum::Bool`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DatumKind {
    /// Tag for `Datum::Int`.
    Int,
    /// Tag for `Datum::UInt`.
    UInt,
    /// Tag for `Datum::Float`.
    Float,
    /// Tag for `Datum::Bytes`.
    Bytes,
}
30+
2331
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
2432
pub enum Datum {
2533
Bool(bool),
@@ -114,6 +122,104 @@ impl Datum {
114122
matches!(self, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_))
115123
}
116124

125+
pub fn kind(&self) -> Option<DatumKind> {
126+
match self {
127+
Datum::Int(_) => Some(DatumKind::Int),
128+
Datum::UInt(_) => Some(DatumKind::UInt),
129+
Datum::Float(_) => Some(DatumKind::Float),
130+
Datum::Bytes(_) => Some(DatumKind::Bytes),
131+
Datum::Bool(_) => None,
132+
}
133+
}
134+
135+
pub fn normalize_to_kind(&self, kind: DatumKind) -> Option<Datum> {
136+
match kind {
137+
DatumKind::Int => self.as_i64().map(Datum::Int),
138+
DatumKind::UInt => self.as_u64().map(Datum::UInt),
139+
DatumKind::Float => self.as_double().ok().map(F64::from).map(Datum::Float),
140+
DatumKind::Bytes => match self {
141+
Datum::Bytes(value) => Some(Datum::Bytes(value.clone())),
142+
_ => None,
143+
},
144+
}
145+
}
146+
147+
pub fn lower_bound_to_kind(&self, kind: DatumKind) -> Option<Datum> {
148+
match kind {
149+
DatumKind::Int => self.lower_bound_as_i64().map(Datum::Int),
150+
DatumKind::UInt => self.lower_bound_as_u64().map(Datum::UInt),
151+
_ => self.normalize_to_kind(kind),
152+
}
153+
}
154+
155+
pub fn upper_bound_to_kind(&self, kind: DatumKind) -> Option<Datum> {
156+
match kind {
157+
DatumKind::Int => self.upper_bound_as_i64().map(Datum::Int),
158+
DatumKind::UInt => self.upper_bound_as_u64().map(Datum::UInt),
159+
_ => self.normalize_to_kind(kind),
160+
}
161+
}
162+
163+
pub fn as_i64(&self) -> Option<i64> {
164+
match self {
165+
Datum::Int(value) => Some(*value),
166+
Datum::UInt(value) => i64::try_from(*value).ok(),
167+
Datum::Float(value) => {
168+
let value = value.into_inner();
169+
(value.is_finite()
170+
&& value.fract() == 0.0
171+
&& value >= i64::MIN as f64
172+
&& value <= i64::MAX as f64)
173+
.then_some(value as i64)
174+
}
175+
_ => None,
176+
}
177+
}
178+
179+
pub fn as_u64(&self) -> Option<u64> {
180+
match self {
181+
Datum::UInt(value) => Some(*value),
182+
Datum::Int(value) => u64::try_from(*value).ok(),
183+
Datum::Float(value) => {
184+
let value = value.into_inner();
185+
(value.is_finite()
186+
&& value.fract() == 0.0
187+
&& value >= 0.0
188+
&& value <= u64::MAX as f64)
189+
.then_some(value as u64)
190+
}
191+
_ => None,
192+
}
193+
}
194+
195+
fn lower_bound_as_i64(&self) -> Option<i64> {
196+
match self {
197+
Datum::Float(value) => float_to_i64_bound(value.into_inner(), f64::ceil),
198+
_ => self.as_i64(),
199+
}
200+
}
201+
202+
fn upper_bound_as_i64(&self) -> Option<i64> {
203+
match self {
204+
Datum::Float(value) => float_to_i64_bound(value.into_inner(), f64::floor),
205+
_ => self.as_i64(),
206+
}
207+
}
208+
209+
fn lower_bound_as_u64(&self) -> Option<u64> {
210+
match self {
211+
Datum::Float(value) => float_to_u64_bound(value.into_inner(), f64::ceil),
212+
_ => self.as_u64(),
213+
}
214+
}
215+
216+
fn upper_bound_as_u64(&self) -> Option<u64> {
217+
match self {
218+
Datum::Float(value) => float_to_u64_bound(value.into_inner(), f64::floor),
219+
_ => self.as_u64(),
220+
}
221+
}
222+
117223
pub fn type_name(&self) -> &'static str {
118224
match self {
119225
Datum::Bool(_) => "Boolean",
@@ -151,3 +257,14 @@ impl Datum {
151257
}
152258
}
153259
}
260+
261+
/// Rounds `value` with `round` and converts the result to `i64`.
///
/// Returns `None` when the rounded value is NaN, infinite, or outside the
/// range representable by `i64`.
fn float_to_i64_bound(value: f64, round: fn(f64) -> f64) -> Option<i64> {
    let rounded = round(value);
    // `i64::MAX as f64` rounds up to 2^63, one past the largest `i64`, so the
    // upper check must be strict. With `<=`, 2^63 would be accepted and the
    // saturating `as` cast would silently produce `i64::MAX`. `i64::MIN as f64`
    // is exactly -2^63, so the lower check stays inclusive.
    let in_range =
        rounded.is_finite() && rounded >= i64::MIN as f64 && rounded < i64::MAX as f64;
    in_range.then_some(rounded as i64)
}
266+
267+
/// Rounds `value` with `round` and converts the result to `u64`.
///
/// Returns `None` when the rounded value is NaN, infinite, negative, or
/// outside the range representable by `u64`.
fn float_to_u64_bound(value: f64, round: fn(f64) -> f64) -> Option<u64> {
    let rounded = round(value);
    // `u64::MAX as f64` rounds up to 2^64, one past the largest `u64`, so the
    // upper check must be strict. With `<=`, 2^64 would be accepted and the
    // saturating `as` cast would silently produce `u64::MAX`.
    let in_range = rounded.is_finite() && rounded >= 0.0 && rounded < u64::MAX as f64;
    in_range.then_some(rounded as u64)
}

src/common/statistics/src/histogram.rs

Lines changed: 243 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -195,12 +195,200 @@ impl Histogram {
195195
}
196196
}
197197

198+
/// Estimate a join for matching histogram types, or for mixed numeric histograms
199+
/// through a temporary float view. Non-numeric mixed types return `None`.
200+
pub fn estimate_join_numeric_compatible(
201+
&self,
202+
other: &Histogram,
203+
) -> ExceptionResult<Option<JoinEstimation>> {
204+
match (self, other) {
205+
(Self::Int(left), Self::Int(right)) => Ok(Some(left.estimate_join(right))),
206+
(Self::UInt(left), Self::UInt(right)) => Ok(Some(left.estimate_join(right))),
207+
(Self::Float(left), Self::Float(right)) => Ok(Some(left.estimate_join(right))),
208+
(Self::Bytes(left), Self::Bytes(right)) => Ok(Some(left.estimate_join(right))),
209+
(Self::Bytes(_), _) | (_, Self::Bytes(_)) => Ok(None),
210+
_ => estimate_mixed_numeric_histogram_join(self, other),
211+
}
212+
}
213+
214+
pub fn restrict_to_bounds(&self, min: &Datum, max: &Datum) -> ExceptionResult<Option<Self>> {
215+
let buckets = self
216+
.bucket_iter()
217+
.map(|bucket| {
218+
let bucket_min = bucket.lower_bound();
219+
let bucket_max = bucket.upper_bound();
220+
if bucket_min.compare(max)? == std::cmp::Ordering::Greater
221+
|| bucket_max.compare(min)? == std::cmp::Ordering::Less
222+
{
223+
return Ok(None);
224+
}
225+
226+
let Some(lower_bound) = max_datum_as_bucket_kind(&bucket_min, min) else {
227+
return Ok(None);
228+
};
229+
let Some(upper_bound) = min_datum_as_bucket_kind(&bucket_max, max) else {
230+
return Ok(None);
231+
};
232+
if lower_bound.compare(&upper_bound)? == std::cmp::Ordering::Greater {
233+
return Ok(None);
234+
}
235+
236+
let selectivity = bucket_overlap_selectivity(
237+
&bucket_min,
238+
&bucket_max,
239+
&lower_bound,
240+
&upper_bound,
241+
);
242+
HistogramBucket::try_from_bounds(
243+
lower_bound,
244+
upper_bound,
245+
bucket.num_values() * selectivity,
246+
bucket.num_distinct() * selectivity,
247+
)
248+
.map(Some)
249+
.map_err(|err| ErrorCode::Internal(err.to_string()))
250+
})
251+
.collect::<ExceptionResult<Vec<_>>>()?
252+
.into_iter()
253+
.flatten()
254+
.collect::<Vec<_>>();
255+
256+
if buckets.is_empty() {
257+
return Ok(None);
258+
}
259+
260+
Self::try_from_buckets(self.accuracy(), buckets, self.avg_spacing())
261+
.map(Some)
262+
.map_err(|err| ErrorCode::Internal(err.to_string()))
263+
}
264+
198265
pub fn is_range_distorted(&self) -> bool {
199266
self.avg_spacing()
200267
.is_some_and(|bucket_width| bucket_width > 1e12)
201268
}
202269
}
203270

271+
fn estimate_mixed_numeric_histogram_join(
272+
left: &Histogram,
273+
right: &Histogram,
274+
) -> ExceptionResult<Option<JoinEstimation>> {
275+
let Some(left) = numeric_histogram_as_float(left)? else {
276+
return Ok(None);
277+
};
278+
let Some(right) = numeric_histogram_as_float(right)? else {
279+
return Ok(None);
280+
};
281+
282+
Ok(Some(left.estimate_join(&right)))
283+
}
284+
285+
fn numeric_histogram_as_float(
286+
histogram: &Histogram,
287+
) -> ExceptionResult<Option<TypedHistogram<F64>>> {
288+
if matches!(histogram, Histogram::Bytes(_)) {
289+
return Ok(None);
290+
}
291+
292+
let buckets = histogram
293+
.bucket_iter()
294+
.map(|bucket| {
295+
let lower_bound = F64::from(bucket.lower_bound().as_double()?);
296+
let upper_bound = F64::from(bucket.upper_bound().as_double()?);
297+
Ok(TypedHistogramBucket::new(
298+
lower_bound,
299+
upper_bound,
300+
bucket.num_values(),
301+
bucket.num_distinct(),
302+
))
303+
})
304+
.collect::<ExceptionResult<Vec<_>>>()?;
305+
306+
Ok(Some(TypedHistogram {
307+
accuracy: histogram.accuracy(),
308+
buckets,
309+
avg_spacing: histogram.avg_spacing(),
310+
}))
311+
}
312+
313+
/// Picks the larger of `bucket_value` and `stat_value` and normalizes it to
/// the kind of `bucket_value`.
///
/// `stat_value` is chosen only when `bucket_value` compares strictly less.
/// Returns `None` if the comparison fails, if `bucket_value` has no kind, or
/// if the chosen datum cannot be normalized to that kind.
fn max_datum_as_bucket_kind(bucket_value: &Datum, stat_value: &Datum) -> Option<Datum> {
    use std::cmp::Ordering;

    let ordering = bucket_value.compare(stat_value).ok()?;
    let target_kind = bucket_value.kind()?;
    let larger = match ordering {
        Ordering::Less => stat_value,
        Ordering::Equal | Ordering::Greater => bucket_value,
    };
    larger.normalize_to_kind(target_kind)
}
321+
322+
/// Picks the smaller of `bucket_value` and `stat_value` and normalizes it to
/// the kind of `bucket_value`.
///
/// `stat_value` is chosen only when `bucket_value` compares strictly greater.
/// Returns `None` if the comparison fails, if `bucket_value` has no kind, or
/// if the chosen datum cannot be normalized to that kind.
fn min_datum_as_bucket_kind(bucket_value: &Datum, stat_value: &Datum) -> Option<Datum> {
    use std::cmp::Ordering;

    let ordering = bucket_value.compare(stat_value).ok()?;
    let target_kind = bucket_value.kind()?;
    let smaller = match ordering {
        Ordering::Greater => stat_value,
        Ordering::Equal | Ordering::Less => bucket_value,
    };
    smaller.normalize_to_kind(target_kind)
}
330+
331+
fn bucket_overlap_selectivity(
332+
bucket_min: &Datum,
333+
bucket_max: &Datum,
334+
new_min: &Datum,
335+
new_max: &Datum,
336+
) -> f64 {
337+
match (bucket_min, bucket_max, new_min, new_max) {
338+
(
339+
Datum::Int(bucket_min),
340+
Datum::Int(bucket_max),
341+
Datum::Int(new_min),
342+
Datum::Int(new_max),
343+
) => discrete_overlap_selectivity(
344+
*bucket_min as i128,
345+
*bucket_max as i128,
346+
*new_min as i128,
347+
*new_max as i128,
348+
),
349+
(
350+
Datum::UInt(bucket_min),
351+
Datum::UInt(bucket_max),
352+
Datum::UInt(new_min),
353+
Datum::UInt(new_max),
354+
) => discrete_overlap_selectivity(
355+
*bucket_min as i128,
356+
*bucket_max as i128,
357+
*new_min as i128,
358+
*new_max as i128,
359+
),
360+
(
361+
Datum::Float(bucket_min),
362+
Datum::Float(bucket_max),
363+
Datum::Float(new_min),
364+
Datum::Float(new_max),
365+
) => {
366+
let bucket_width = bucket_max.into_inner() - bucket_min.into_inner();
367+
if bucket_width <= 0.0 {
368+
return 1.0;
369+
}
370+
let overlap_width = new_max.into_inner() - new_min.into_inner();
371+
(overlap_width / bucket_width).clamp(0.0, 1.0)
372+
}
373+
(Datum::Bytes(_), Datum::Bytes(_), Datum::Bytes(_), Datum::Bytes(_)) => 1.0,
374+
_ => 1.0,
375+
}
376+
}
377+
378+
/// Fraction of the integers in `[bucket_min, bucket_max]` that also lie in
/// `[new_min, new_max]`, treating both ranges as inclusive.
///
/// The result is clamped to `[0.0, 1.0]`. A bucket containing no integers
/// (`bucket_max < bucket_min`) yields `1.0`.
fn discrete_overlap_selectivity(
    bucket_min: i128,
    bucket_max: i128,
    new_min: i128,
    new_max: i128,
) -> f64 {
    match bucket_max - bucket_min + 1 {
        bucket_span if bucket_span > 0 => {
            let overlap_span = new_max - new_min + 1;
            let ratio = overlap_span as f64 / bucket_span as f64;
            ratio.clamp(0.0, 1.0)
        }
        _ => 1.0,
    }
}
391+
204392
pub enum HistogramBucketIter<'a> {
205393
Int(std::slice::Iter<'a, TypedHistogramBucket<i64>>),
206394
UInt(std::slice::Iter<'a, TypedHistogramBucket<u64>>),
@@ -393,3 +581,58 @@ impl fmt::Display for Histogram {
393581
Ok(())
394582
}
395583
}
584+
585+
#[cfg(test)]
mod tests {
    use super::*;

    /// Restricting a two-bucket unsigned histogram to `[2, 6]` keeps both
    /// buckets, narrows their bounds and scales their counts.
    #[test]
    fn test_restrict_to_bounds_uses_existing_buckets() -> ExceptionResult<()> {
        let source = Histogram::UInt(TypedHistogram {
            accuracy: true,
            buckets: vec![
                TypedHistogramBucket::new(0, 4, 5.0, 5.0),
                TypedHistogramBucket::new(5, 9, 5.0, 5.0),
            ],
            avg_spacing: None,
        });
        let (min, max) = (Datum::UInt(2), Datum::UInt(6));

        let clipped = source
            .restrict_to_bounds(&min, &max)?
            .expect("histogram should keep intersecting buckets");
        let clipped_buckets: Vec<_> = clipped.bucket_iter().collect();

        assert_eq!(clipped_buckets.len(), 2);

        // (lower bound, upper bound, expected value/distinct count)
        let expected = [(2u64, 4u64, 3.0), (5u64, 6u64, 2.0)];
        for (bucket, (lower, upper, count)) in clipped_buckets.iter().zip(expected) {
            assert_eq!(bucket.lower_bound(), Datum::UInt(lower));
            assert_eq!(bucket.upper_bound(), Datum::UInt(upper));
            assert_eq!(bucket.num_values(), count);
            assert_eq!(bucket.num_distinct(), count);
        }
        Ok(())
    }

    /// Joining an `Int` histogram with a `UInt` histogram goes through the
    /// float view and still produces an estimation.
    #[test]
    fn test_mixed_numeric_join_uses_float_view_of_existing_buckets() -> ExceptionResult<()> {
        let signed = Histogram::Int(TypedHistogram {
            accuracy: true,
            buckets: vec![TypedHistogramBucket::new(1, 1, 3.0, 1.0)],
            avg_spacing: None,
        });
        let unsigned = Histogram::UInt(TypedHistogram {
            accuracy: true,
            buckets: vec![TypedHistogramBucket::new(1, 1, 2.0, 1.0)],
            avg_spacing: None,
        });

        let joined = signed.estimate_join_numeric_compatible(&unsigned)?;
        let estimation = joined.expect("mixed numeric histograms should use a float view");

        assert_eq!(estimation.cardinality.expected, 6.0);
        assert_eq!(estimation.ndv.expected, 1.0);
        Ok(())
    }
}

0 commit comments

Comments
 (0)