Skip to content

Commit 53c3611

Browse files
committed
x
1 parent 88c46bd commit 53c3611

3 files changed

Lines changed: 163 additions & 106 deletions

File tree

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
// Copyright 2021 Datafuse Labs
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::ops::Bound;
16+
17+
use databend_common_exception::Result;
18+
use databend_common_statistics::DEFAULT_HISTOGRAM_BUCKETS;
19+
use databend_common_statistics::Datum;
20+
21+
use super::HistogramBuilder;
22+
use crate::optimizer::ir::ColumnStat;
23+
use crate::plans::ComparisonOp;
24+
25+
pub(super) enum ValueConstraint {
26+
Eq(Datum),
27+
NotEq(Datum),
28+
Range {
29+
lower: Bound<Datum>,
30+
upper: Bound<Datum>,
31+
},
32+
}
33+
34+
impl ValueConstraint {
35+
pub(super) fn from_comparison(op: ComparisonOp, datum: Datum) -> Self {
36+
match op {
37+
ComparisonOp::Equal => ValueConstraint::Eq(datum),
38+
ComparisonOp::NotEqual => ValueConstraint::NotEq(datum),
39+
ComparisonOp::GT => ValueConstraint::Range {
40+
lower: Bound::Excluded(datum),
41+
upper: Bound::Unbounded,
42+
},
43+
ComparisonOp::GTE => ValueConstraint::Range {
44+
lower: Bound::Included(datum),
45+
upper: Bound::Unbounded,
46+
},
47+
ComparisonOp::LT => ValueConstraint::Range {
48+
lower: Bound::Unbounded,
49+
upper: Bound::Excluded(datum),
50+
},
51+
ComparisonOp::LTE => ValueConstraint::Range {
52+
lower: Bound::Unbounded,
53+
upper: Bound::Included(datum),
54+
},
55+
}
56+
}
57+
58+
pub(super) fn apply_to_column_stat(
59+
&self,
60+
column_stat: &mut ColumnStat,
61+
selectivity: Option<f64>,
62+
) -> Result<()> {
63+
match self {
64+
ValueConstraint::Eq(datum) => {
65+
*column_stat = ColumnStat::from_const(datum.clone());
66+
}
67+
ValueConstraint::NotEq(datum) => {
68+
let _ = datum;
69+
if let Some(selectivity) = selectivity {
70+
update_statistic(
71+
column_stat,
72+
column_stat.min.clone(),
73+
column_stat.max.clone(),
74+
selectivity,
75+
)?;
76+
}
77+
}
78+
ValueConstraint::Range { .. } => match selectivity {
79+
Some(0.0) => {
80+
column_stat.ndv = column_stat.ndv.reduce_by_selectivity(0.0);
81+
}
82+
Some(selectivity) if selectivity < 1.0 => {
83+
if let Some((new_min, new_max)) = self.range_bounds(column_stat)? {
84+
update_statistic(column_stat, new_min, new_max, selectivity)?;
85+
}
86+
}
87+
_ => {}
88+
},
89+
}
90+
91+
Ok(())
92+
}
93+
94+
fn range_bounds(&self, column_stat: &ColumnStat) -> Result<Option<(Datum, Datum)>> {
95+
let ValueConstraint::Range { lower, upper } = self else {
96+
unreachable!()
97+
};
98+
99+
let new_min = match lower {
100+
Bound::Unbounded => Some(column_stat.min.clone()),
101+
Bound::Included(datum) | Bound::Excluded(datum) => {
102+
Datum::max(Some(column_stat.min.clone()), Some(datum.clone()))
103+
}
104+
};
105+
let new_max = match upper {
106+
Bound::Unbounded => Some(column_stat.max.clone()),
107+
Bound::Included(datum) | Bound::Excluded(datum) => {
108+
Datum::min(Some(column_stat.max.clone()), Some(datum.clone()))
109+
}
110+
};
111+
112+
let (Some(new_min), Some(new_max)) = (new_min, new_max) else {
113+
return Ok(None);
114+
};
115+
if new_min.compare(&new_max)? == std::cmp::Ordering::Greater {
116+
return Ok(None);
117+
}
118+
119+
Ok(Some((new_min, new_max)))
120+
}
121+
}
122+
123+
fn update_statistic(
124+
column_stat: &mut ColumnStat,
125+
new_min: Datum,
126+
new_max: Datum,
127+
selectivity: f64,
128+
) -> Result<()> {
129+
column_stat.ndv = column_stat.ndv.reduce_by_selectivity(selectivity);
130+
column_stat.min = new_min.clone();
131+
column_stat.max = new_max.clone();
132+
column_stat.null_count = (column_stat.null_count as f64 * selectivity).ceil() as u64;
133+
134+
if let Some(histogram) = &column_stat.histogram {
135+
// If selectivity < 0.2, most buckets are invalid and
136+
// the accuracy histogram can be discarded.
137+
// Todo: support unfixed buckets number for histogram and prune the histogram.
138+
if !histogram.accuracy() || selectivity < 0.2 {
139+
let num_values = histogram.num_values();
140+
let new_num_values = (num_values * selectivity).ceil() as u64;
141+
let new_ndv = column_stat.ndv.value() as u64;
142+
column_stat.histogram = if new_ndv <= 2 {
143+
None
144+
} else {
145+
Some(HistogramBuilder::from_ndv(
146+
new_ndv,
147+
new_num_values.max(new_ndv),
148+
Some((new_min, new_max)),
149+
DEFAULT_HISTOGRAM_BUCKETS,
150+
)?)
151+
}
152+
}
153+
}
154+
155+
Ok(())
156+
}

src/query/sql/src/planner/optimizer/ir/stats/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515
mod column_stat;
16+
mod constraint;
1617
mod histogram;
1718
mod selectivity;
1819

src/query/sql/src/planner/optimizer/ir/stats/selectivity.rs

Lines changed: 6 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,12 @@ use databend_common_expression::stat_distribution::ArgStat;
2626
use databend_common_expression::types::DataType;
2727
use databend_common_expression::types::NumberScalar;
2828
use databend_common_functions::BUILTIN_FUNCTIONS;
29-
use databend_common_statistics::DEFAULT_HISTOGRAM_BUCKETS;
30-
use databend_common_statistics::Datum;
3129

30+
use super::constraint::ValueConstraint;
3231
use crate::ColumnBinding;
3332
use crate::Symbol;
3433
use crate::optimizer::ir::ColumnStat;
3534
use crate::optimizer::ir::ColumnStatSet;
36-
use crate::optimizer::ir::HistogramBuilder;
3735
use crate::plans::ComparisonOp;
3836
use crate::plans::FunctionCall;
3937
use crate::plans::ScalarExpr;
@@ -237,7 +235,8 @@ impl SelectivityVisitor<'_> {
237235
let column_stat = self
238236
.ensure_column_stat(column_index)
239237
.expect("checked above");
240-
Self::update_comparison_column_stat(column_stat, op, const_datum, selectivity)?;
238+
let constraint = ValueConstraint::from_comparison(op, const_datum);
239+
constraint.apply_to_column_stat(column_stat, selectivity.as_n().copied())?;
241240
return Ok(selectivity);
242241
}
243242
(Expr::FunctionCall(func), Expr::Constant(val))
@@ -294,47 +293,6 @@ impl SelectivityVisitor<'_> {
294293
Ok(Selectivity::N(distr.true_count.expected / self.cardinality))
295294
}
296295

297-
fn update_comparison_column_stat(
298-
column_stat: &mut ColumnStat,
299-
op: ComparisonOp,
300-
const_datum: Datum,
301-
selectivity: Selectivity,
302-
) -> Result<()> {
303-
match op {
304-
ComparisonOp::Equal => {
305-
*column_stat = ColumnStat::from_const(const_datum);
306-
Ok(())
307-
}
308-
ComparisonOp::NotEqual => {
309-
if let Selectivity::N(n) = selectivity {
310-
update_statistic(
311-
column_stat,
312-
column_stat.min.clone(),
313-
column_stat.max.clone(),
314-
n,
315-
)?;
316-
}
317-
Ok(())
318-
}
319-
_ => {
320-
match selectivity {
321-
Selectivity::N(0.0) => {
322-
column_stat.ndv = column_stat.ndv.reduce_by_selectivity(0.0);
323-
}
324-
Selectivity::N(n) if n < 1.0 => {
325-
if let Some((new_min, new_max)) =
326-
comparison_range_bounds(column_stat, &const_datum, op)?
327-
{
328-
update_statistic(column_stat, new_min, new_max, n)?;
329-
}
330-
}
331-
_ => {}
332-
}
333-
Ok(())
334-
}
335-
}
336-
}
337-
338296
// The method uses probability predication to compute like selectivity.
339297
// The core idea is from postgresql.
340298
fn compute_like(&mut self, func: &ExprCall) -> Result<Selectivity> {
@@ -532,78 +490,20 @@ fn is_true_constant_predicate(constant: &Constant) -> bool {
532490
}
533491
}
534492

535-
fn comparison_range_bounds(
536-
column_stat: &ColumnStat,
537-
const_datum: &Datum,
538-
op: ComparisonOp,
539-
) -> Result<Option<(Datum, Datum)>> {
540-
let (new_min, new_max) = match op {
541-
ComparisonOp::GT | ComparisonOp::GTE => (
542-
Datum::max(Some(column_stat.min.clone()), Some(const_datum.clone())),
543-
Some(column_stat.max.clone()),
544-
),
545-
ComparisonOp::LT | ComparisonOp::LTE => (
546-
Some(column_stat.min.clone()),
547-
Datum::min(Some(column_stat.max.clone()), Some(const_datum.clone())),
548-
),
549-
_ => unreachable!(),
550-
};
551-
let (Some(new_min), Some(new_max)) = (new_min, new_max) else {
552-
return Ok(None);
553-
};
554-
if new_min.compare(&new_max)? == std::cmp::Ordering::Greater {
555-
return Ok(None);
556-
}
557-
558-
Ok(Some((new_min, new_max)))
559-
}
560-
561-
fn update_statistic(
562-
column_stat: &mut ColumnStat,
563-
new_min: Datum,
564-
new_max: Datum,
565-
selectivity: f64,
566-
) -> Result<()> {
567-
column_stat.ndv = column_stat.ndv.reduce_by_selectivity(selectivity);
568-
column_stat.min = new_min.clone();
569-
column_stat.max = new_max.clone();
570-
column_stat.null_count = (column_stat.null_count as f64 * selectivity).ceil() as u64;
571-
572-
if let Some(histogram) = &column_stat.histogram {
573-
// If selectivity < 0.2, most buckets are invalid and
574-
// the accuracy histogram can be discarded.
575-
// Todo: support unfixed buckets number for histogram and prune the histogram.
576-
if !histogram.accuracy() || selectivity < 0.2 {
577-
let num_values = histogram.num_values();
578-
let new_num_values = (num_values * selectivity).ceil() as u64;
579-
let new_ndv = column_stat.ndv.value() as u64;
580-
column_stat.histogram = if new_ndv <= 2 {
581-
None
582-
} else {
583-
Some(HistogramBuilder::from_ndv(
584-
new_ndv,
585-
new_num_values.max(new_ndv),
586-
Some((new_min, new_max)),
587-
DEFAULT_HISTOGRAM_BUCKETS,
588-
)?)
589-
}
590-
}
591-
}
592-
593-
Ok(())
594-
}
595-
596493
#[cfg(test)]
597494
mod tests {
598495
use databend_common_expression::Scalar;
599496
use databend_common_expression::types::NumberDataType;
600497
use databend_common_expression::types::NumberScalar;
601498
use databend_common_expression::types::decimal::DecimalScalar;
602499
use databend_common_expression::types::decimal::DecimalSize;
500+
use databend_common_statistics::DEFAULT_HISTOGRAM_BUCKETS;
501+
use databend_common_statistics::Datum;
603502

604503
use super::*;
605504
use crate::ColumnBindingBuilder;
606505
use crate::Visibility;
506+
use crate::optimizer::ir::HistogramBuilder;
607507
use crate::optimizer::ir::Ndv;
608508
use crate::plans::BoundColumnRef;
609509
use crate::plans::ConstantExpr;

0 commit comments

Comments
 (0)