Skip to content

Commit 99ae306

Browse files
committed
xx
1 parent fe72372 commit 99ae306

4 files changed

Lines changed: 843 additions & 254 deletions

File tree

src/query/expression/src/function/comparison.rs

Lines changed: 133 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -13,12 +13,19 @@
1313
// limitations under the License.
1414

1515
use std::cmp::Ordering;
16+
use std::marker::PhantomData;
1617

1718
use crate::Scalar;
19+
use crate::property::Domain;
1820
use crate::stat_distribution::ArgStat;
21+
use crate::stat_distribution::BooleanDistribution;
1922
use crate::stat_distribution::Ndv;
23+
use crate::stat_distribution::OwnedDistribution;
24+
use crate::stat_distribution::ReturnStat;
2025
use crate::stat_distribution::StatBinaryArg;
2126
use crate::stat_distribution::StatEstimate;
27+
use crate::types::boolean::BooleanDomain;
28+
use crate::types::nullable::NullableDomain;
2229

2330
pub trait StatComparisonOp {
2431
type Reverse: StatComparisonOp;
@@ -72,13 +79,13 @@ pub trait StatComparisonOp {
7279
}
7380
}
7481

75-
#[derive(Default)]
82+
#[derive(Default, Clone, Copy)]
7683
pub struct LtOp;
77-
#[derive(Default)]
84+
#[derive(Default, Clone, Copy)]
7885
pub struct LteOp;
79-
#[derive(Default)]
86+
#[derive(Default, Clone, Copy)]
8087
pub struct GtOp;
81-
#[derive(Default)]
88+
#[derive(Default, Clone, Copy)]
8289
pub struct GteOp;
8390

8491
impl StatComparisonOp for LtOp {
@@ -109,72 +116,146 @@ impl StatComparisonOp for GteOp {
109116
const INCLUDE_EQUAL: bool = true;
110117
}
111118

112-
pub struct ConstantComparison<'s, 'a> {
119+
pub struct ConstantComparison<'s, 'a, A: ConstantComparisonAdapter> {
113120
pub stat: &'s ArgStat<'a>,
114-
pub constant: Scalar,
115-
pub cardinality: f64,
121+
pub constant: A::Value,
122+
pub domain: Option<A::Domain>,
123+
pub non_null_cardinality: f64,
124+
pub null_count: u64,
125+
pub nullable: bool,
126+
_a: PhantomData<fn(A)>,
116127
}
117128

118-
impl<'s, 'a> ConstantComparison<'s, 'a> {
119-
pub fn from_equality_args(stat: &'s StatBinaryArg<'a>) -> Option<Self> {
120-
Self::from_right_constant(stat).or_else(|| Self::from_left_constant(stat))
129+
pub trait ConstantComparisonAdapter {
130+
type Value;
131+
type Domain;
132+
133+
fn constant(scalar: Scalar) -> Result<Self::Value, String>;
134+
135+
fn domain(domain: &Domain) -> Result<Self::Domain, String>;
136+
137+
fn compare(left: &Self::Value, right: &Self::Value) -> Ordering;
138+
}
139+
140+
impl<'s, 'a, A: ConstantComparisonAdapter> ConstantComparison<'s, 'a, A> {
141+
pub fn from_constant_args(stat: &'s StatBinaryArg<'a>) -> Result<Option<(Self, bool)>, String> {
142+
if let Some(input) =
143+
Self::new(&stat.args[0], &stat.args[1], stat.cardinality)?.map(|input| (input, false))
144+
{
145+
return Ok(Some(input));
146+
}
147+
Ok(Self::new(&stat.args[1], &stat.args[0], stat.cardinality)?.map(|input| (input, true)))
121148
}
122149

123-
pub fn from_right_constant(stat: &'s StatBinaryArg<'a>) -> Option<Self> {
124-
Some(Self {
125-
stat: &stat.args[0],
126-
constant: stat.args[1].singleton()?,
127-
cardinality: stat.cardinality,
128-
})
150+
fn new(
151+
stat: &'s ArgStat<'a>,
152+
constant_stat: &ArgStat<'_>,
153+
input_cardinality: f64,
154+
) -> Result<Option<Self>, String> {
155+
let Some(constant) = constant_stat.singleton() else {
156+
return Ok(None);
157+
};
158+
if constant.is_null() {
159+
return Err(
160+
"constant comparison null constant was not handled before typed comparison"
161+
.to_string(),
162+
);
163+
}
164+
let nullable = stat.domain.is_nullable() || constant_stat.domain.is_nullable();
165+
let null_count = stat.null_count.min(input_cardinality.ceil() as u64);
166+
let non_null_cardinality = (input_cardinality - null_count as f64).max(0.0);
167+
let domain = match &stat.domain {
168+
Domain::Nullable(NullableDomain { value: None, .. }) => None,
169+
Domain::Nullable(NullableDomain {
170+
value: Some(box domain),
171+
..
172+
})
173+
| domain => match A::domain(domain) {
174+
Ok(domain) => Some(domain),
175+
Err(err) => {
176+
return Err(err);
177+
}
178+
},
179+
};
180+
let constant = match A::constant(constant) {
181+
Ok(constant) => constant,
182+
Err(err) => {
183+
return Err(err);
184+
}
185+
};
186+
187+
Ok(Some(Self {
188+
stat,
189+
constant,
190+
domain,
191+
non_null_cardinality,
192+
null_count,
193+
nullable,
194+
_a: PhantomData,
195+
}))
129196
}
130197

131-
pub fn from_left_constant(stat: &'s StatBinaryArg<'a>) -> Option<Self> {
132-
Some(Self {
133-
stat: &stat.args[1],
134-
constant: stat.args[0].singleton()?,
135-
cardinality: stat.cardinality,
136-
})
198+
pub fn boolean_stat(&self, true_count: StatEstimate) -> ReturnStat {
199+
let domain = if self.nullable {
200+
Domain::Nullable(NullableDomain {
201+
has_null: self.null_count != 0,
202+
value: Some(Box::new(Domain::Boolean(BooleanDomain {
203+
has_true: true,
204+
has_false: true,
205+
}))),
206+
})
207+
} else {
208+
Domain::Boolean(BooleanDomain {
209+
has_true: true,
210+
has_false: true,
211+
})
212+
};
213+
214+
ReturnStat {
215+
domain,
216+
ndv: Ndv::Stat(2.0),
217+
null_count: self.null_count,
218+
distribution: OwnedDistribution::Boolean(BooleanDistribution { true_count }),
219+
}
137220
}
138221

139-
pub fn equality_true_count(
222+
pub fn constant_equality_true_count(
140223
&self,
224+
minmax_cmp: Option<(Ordering, Ordering)>,
141225
not_eq: bool,
142-
compare: impl Fn(&Scalar, &Scalar) -> Option<Ordering>,
143-
) -> Option<StatEstimate> {
144-
let Some((min, max)) = self.stat.value_minmax() else {
145-
return Some(StatEstimate::exact(if not_eq {
146-
self.cardinality
147-
} else {
148-
0.0
149-
}));
226+
) -> StatEstimate {
227+
let Some((cmp_min, cmp_max)) = minmax_cmp else {
228+
return estimate_ndv_true_count(self.stat.ndv, not_eq, self.non_null_cardinality);
150229
};
151-
if compare(&self.constant, &min)? == Ordering::Less
152-
|| compare(&self.constant, &max)? == Ordering::Greater
153-
{
154-
return Some(StatEstimate::exact(if not_eq {
155-
self.cardinality
230+
if cmp_min == Ordering::Less || cmp_max == Ordering::Greater {
231+
return StatEstimate::exact(if not_eq {
232+
self.non_null_cardinality
156233
} else {
157234
0.0
158-
}));
235+
});
159236
}
160237

161-
Some(estimate_ndv_true_count(
162-
self.stat.ndv,
163-
not_eq,
164-
self.cardinality,
165-
))
238+
estimate_ndv_true_count(self.stat.ndv, not_eq, self.non_null_cardinality)
166239
}
240+
}
167241

168-
pub fn minmax_range_true_count<Op: StatComparisonOp>(
169-
&self,
170-
compare: impl Fn(&Scalar, &Scalar) -> Option<Ordering>,
171-
) -> Option<StatEstimate> {
172-
try {
173-
let (min, max) = self.stat.value_minmax()?;
174-
let cmp_min = compare(&self.constant, &min)?;
175-
let cmp_max = compare(&self.constant, &max)?;
176-
Op::estimate_minmax_range_true_count(self.stat.ndv, self.cardinality, cmp_min, cmp_max)?
177-
}
242+
pub fn null_comparison_stat(stat: &StatBinaryArg) -> Option<ReturnStat> {
243+
if stat.args.iter().any(|arg| {
244+
arg.domain
245+
.as_singleton()
246+
.is_some_and(|scalar| scalar.is_null())
247+
}) {
248+
Some(ReturnStat {
249+
domain: Domain::Nullable(NullableDomain {
250+
has_null: true,
251+
value: None,
252+
}),
253+
ndv: Ndv::Stat(0.0),
254+
null_count: stat.cardinality.ceil() as u64,
255+
distribution: OwnedDistribution::Unknown,
256+
})
257+
} else {
258+
None
178259
}
179260
}
180261

src/query/expression/src/function/stat_distribution.rs

Lines changed: 73 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -71,17 +71,6 @@ impl<D: DistributionInvariant> StatDistribution<D> {
7171
}
7272

7373
impl<D> StatDistribution<D> {
74-
pub fn value_domain(&self) -> Option<&Domain> {
75-
match &self.domain {
76-
Domain::Nullable(domain) => domain.value.as_deref(),
77-
domain => Some(domain),
78-
}
79-
}
80-
81-
pub fn value_minmax(&self) -> Option<(Scalar, Scalar)> {
82-
self.value_domain().map(Domain::to_minmax)
83-
}
84-
8574
pub fn singleton(&self) -> Option<Scalar> {
8675
self.domain.as_singleton()
8776
}
@@ -106,18 +95,6 @@ impl<'a> ArgStat<'a> {
10695
}
10796

10897
impl ReturnStat {
109-
pub fn boolean(true_count: StatEstimate) -> Self {
110-
Self {
111-
domain: Domain::Boolean(BooleanDomain {
112-
has_true: true,
113-
has_false: true,
114-
}),
115-
ndv: Ndv::Stat(2.0),
116-
null_count: 0,
117-
distribution: OwnedDistribution::Boolean(BooleanDistribution { true_count }),
118-
}
119-
}
120-
12198
pub fn histogram(&self) -> Option<&Histogram> {
12299
self.distribution.as_histogram()
123100
}
@@ -241,8 +218,11 @@ fn check_histogram_distribution<D>(
241218
if !histogram_ndv.is_finite() || histogram_ndv < 0.0 {
242219
return Err(format!("histogram ndv is invalid: {histogram_ndv}"));
243220
}
244-
if histogram.num_buckets() != 0 && stat.value_domain().is_none() {
245-
return Err("histogram distribution requires a value domain".to_string());
221+
if matches!(
222+
stat.domain,
223+
Domain::Nullable(NullableDomain { value: None, .. })
224+
) {
225+
return Err("histogram distribution requires a non-null value domain".to_string());
246226
}
247227
Ok(())
248228
}
@@ -251,9 +231,12 @@ fn check_boolean_distribution<D>(
251231
stat: &StatDistribution<D>,
252232
distribution: &BooleanDistribution,
253233
) -> Result<(), String> {
254-
if !matches!(stat.value_domain(), Some(Domain::Boolean(_))) {
234+
if !matches!(
235+
stat.domain,
236+
Domain::Nullable(NullableDomain { value: Some(box Domain::Boolean(_)), .. })|Domain::Boolean(_)
237+
) {
255238
return Err(format!(
256-
"boolean distribution requires boolean value domain, got {:?}",
239+
"boolean distribution requires boolean non-null value domain, got {:?}",
257240
stat.domain
258241
));
259242
}
@@ -310,3 +293,66 @@ impl StatEstimate {
310293
pub struct BooleanDistribution {
311294
pub true_count: StatEstimate,
312295
}
296+
297+
#[cfg(test)]
298+
mod tests {
299+
use databend_common_statistics::Histogram;
300+
use databend_common_statistics::TypedHistogram;
301+
302+
use super::*;
303+
use crate::types::boolean::BooleanDomain;
304+
305+
#[test]
306+
fn test_empty_histogram_requires_non_null_value_domain() {
307+
let stat = ReturnStat {
308+
domain: Domain::Nullable(NullableDomain {
309+
has_null: true,
310+
value: None,
311+
}),
312+
ndv: Ndv::Stat(0.0),
313+
null_count: 10,
314+
distribution: OwnedDistribution::Histogram(Histogram::Int(TypedHistogram::new(
315+
vec![],
316+
true,
317+
))),
318+
};
319+
320+
let err = stat.check_consistency().unwrap_err();
321+
assert!(err.contains("non-null value domain"));
322+
}
323+
324+
#[test]
325+
fn test_nullable_boolean_distribution_checks_non_null_value_domain() {
326+
let valid = ReturnStat {
327+
domain: Domain::Nullable(NullableDomain {
328+
has_null: true,
329+
value: Some(Box::new(Domain::Boolean(BooleanDomain {
330+
has_true: true,
331+
has_false: true,
332+
}))),
333+
}),
334+
ndv: Ndv::Stat(2.0),
335+
null_count: 1,
336+
distribution: OwnedDistribution::Boolean(BooleanDistribution {
337+
true_count: StatEstimate::exact(1.0),
338+
}),
339+
};
340+
341+
valid.check_consistency().unwrap();
342+
343+
let invalid = ReturnStat {
344+
domain: Domain::Nullable(NullableDomain {
345+
has_null: true,
346+
value: None,
347+
}),
348+
ndv: Ndv::Stat(0.0),
349+
null_count: 10,
350+
distribution: OwnedDistribution::Boolean(BooleanDistribution {
351+
true_count: StatEstimate::exact(0.0),
352+
}),
353+
};
354+
355+
let err = invalid.check_consistency().unwrap_err();
356+
assert!(err.contains("boolean non-null value domain"));
357+
}
358+
}

0 commit comments

Comments
 (0)