@@ -195,12 +195,200 @@ impl Histogram {
195195 }
196196 }
197197
198+ /// Estimate a join for matching histogram types, or for mixed numeric histograms
199+ /// through a temporary float view. Non-numeric mixed types return `None`.
200+ pub fn estimate_join_numeric_compatible (
201+ & self ,
202+ other : & Histogram ,
203+ ) -> ExceptionResult < Option < JoinEstimation > > {
204+ match ( self , other) {
205+ ( Self :: Int ( left) , Self :: Int ( right) ) => Ok ( Some ( left. estimate_join ( right) ) ) ,
206+ ( Self :: UInt ( left) , Self :: UInt ( right) ) => Ok ( Some ( left. estimate_join ( right) ) ) ,
207+ ( Self :: Float ( left) , Self :: Float ( right) ) => Ok ( Some ( left. estimate_join ( right) ) ) ,
208+ ( Self :: Bytes ( left) , Self :: Bytes ( right) ) => Ok ( Some ( left. estimate_join ( right) ) ) ,
209+ ( Self :: Bytes ( _) , _) | ( _, Self :: Bytes ( _) ) => Ok ( None ) ,
210+ _ => estimate_mixed_numeric_histogram_join ( self , other) ,
211+ }
212+ }
213+
214+ pub fn restrict_to_bounds ( & self , min : & Datum , max : & Datum ) -> ExceptionResult < Option < Self > > {
215+ let buckets = self
216+ . bucket_iter ( )
217+ . map ( |bucket| {
218+ let bucket_min = bucket. lower_bound ( ) ;
219+ let bucket_max = bucket. upper_bound ( ) ;
220+ if bucket_min. compare ( max) ? == std:: cmp:: Ordering :: Greater
221+ || bucket_max. compare ( min) ? == std:: cmp:: Ordering :: Less
222+ {
223+ return Ok ( None ) ;
224+ }
225+
226+ let Some ( lower_bound) = max_datum_as_bucket_kind ( & bucket_min, min) else {
227+ return Ok ( None ) ;
228+ } ;
229+ let Some ( upper_bound) = min_datum_as_bucket_kind ( & bucket_max, max) else {
230+ return Ok ( None ) ;
231+ } ;
232+ if lower_bound. compare ( & upper_bound) ? == std:: cmp:: Ordering :: Greater {
233+ return Ok ( None ) ;
234+ }
235+
236+ let selectivity = bucket_overlap_selectivity (
237+ & bucket_min,
238+ & bucket_max,
239+ & lower_bound,
240+ & upper_bound,
241+ ) ;
242+ HistogramBucket :: try_from_bounds (
243+ lower_bound,
244+ upper_bound,
245+ bucket. num_values ( ) * selectivity,
246+ bucket. num_distinct ( ) * selectivity,
247+ )
248+ . map ( Some )
249+ . map_err ( |err| ErrorCode :: Internal ( err. to_string ( ) ) )
250+ } )
251+ . collect :: < ExceptionResult < Vec < _ > > > ( ) ?
252+ . into_iter ( )
253+ . flatten ( )
254+ . collect :: < Vec < _ > > ( ) ;
255+
256+ if buckets. is_empty ( ) {
257+ return Ok ( None ) ;
258+ }
259+
260+ Self :: try_from_buckets ( self . accuracy ( ) , buckets, self . avg_spacing ( ) )
261+ . map ( Some )
262+ . map_err ( |err| ErrorCode :: Internal ( err. to_string ( ) ) )
263+ }
264+
198265 pub fn is_range_distorted ( & self ) -> bool {
199266 self . avg_spacing ( )
200267 . is_some_and ( |bucket_width| bucket_width > 1e12 )
201268 }
202269}
203270
271+ fn estimate_mixed_numeric_histogram_join (
272+ left : & Histogram ,
273+ right : & Histogram ,
274+ ) -> ExceptionResult < Option < JoinEstimation > > {
275+ let Some ( left) = numeric_histogram_as_float ( left) ? else {
276+ return Ok ( None ) ;
277+ } ;
278+ let Some ( right) = numeric_histogram_as_float ( right) ? else {
279+ return Ok ( None ) ;
280+ } ;
281+
282+ Ok ( Some ( left. estimate_join ( & right) ) )
283+ }
284+
285+ fn numeric_histogram_as_float (
286+ histogram : & Histogram ,
287+ ) -> ExceptionResult < Option < TypedHistogram < F64 > > > {
288+ if matches ! ( histogram, Histogram :: Bytes ( _) ) {
289+ return Ok ( None ) ;
290+ }
291+
292+ let buckets = histogram
293+ . bucket_iter ( )
294+ . map ( |bucket| {
295+ let lower_bound = F64 :: from ( bucket. lower_bound ( ) . as_double ( ) ?) ;
296+ let upper_bound = F64 :: from ( bucket. upper_bound ( ) . as_double ( ) ?) ;
297+ Ok ( TypedHistogramBucket :: new (
298+ lower_bound,
299+ upper_bound,
300+ bucket. num_values ( ) ,
301+ bucket. num_distinct ( ) ,
302+ ) )
303+ } )
304+ . collect :: < ExceptionResult < Vec < _ > > > ( ) ?;
305+
306+ Ok ( Some ( TypedHistogram {
307+ accuracy : histogram. accuracy ( ) ,
308+ buckets,
309+ avg_spacing : histogram. avg_spacing ( ) ,
310+ } ) )
311+ }
312+
313+ fn max_datum_as_bucket_kind ( bucket_value : & Datum , stat_value : & Datum ) -> Option < Datum > {
314+ let selected = if bucket_value. compare ( stat_value) . ok ( ) ? == std:: cmp:: Ordering :: Less {
315+ stat_value
316+ } else {
317+ bucket_value
318+ } ;
319+ selected. normalize_to_kind ( bucket_value. kind ( ) ?)
320+ }
321+
322+ fn min_datum_as_bucket_kind ( bucket_value : & Datum , stat_value : & Datum ) -> Option < Datum > {
323+ let selected = if bucket_value. compare ( stat_value) . ok ( ) ? == std:: cmp:: Ordering :: Greater {
324+ stat_value
325+ } else {
326+ bucket_value
327+ } ;
328+ selected. normalize_to_kind ( bucket_value. kind ( ) ?)
329+ }
330+
331+ fn bucket_overlap_selectivity (
332+ bucket_min : & Datum ,
333+ bucket_max : & Datum ,
334+ new_min : & Datum ,
335+ new_max : & Datum ,
336+ ) -> f64 {
337+ match ( bucket_min, bucket_max, new_min, new_max) {
338+ (
339+ Datum :: Int ( bucket_min) ,
340+ Datum :: Int ( bucket_max) ,
341+ Datum :: Int ( new_min) ,
342+ Datum :: Int ( new_max) ,
343+ ) => discrete_overlap_selectivity (
344+ * bucket_min as i128 ,
345+ * bucket_max as i128 ,
346+ * new_min as i128 ,
347+ * new_max as i128 ,
348+ ) ,
349+ (
350+ Datum :: UInt ( bucket_min) ,
351+ Datum :: UInt ( bucket_max) ,
352+ Datum :: UInt ( new_min) ,
353+ Datum :: UInt ( new_max) ,
354+ ) => discrete_overlap_selectivity (
355+ * bucket_min as i128 ,
356+ * bucket_max as i128 ,
357+ * new_min as i128 ,
358+ * new_max as i128 ,
359+ ) ,
360+ (
361+ Datum :: Float ( bucket_min) ,
362+ Datum :: Float ( bucket_max) ,
363+ Datum :: Float ( new_min) ,
364+ Datum :: Float ( new_max) ,
365+ ) => {
366+ let bucket_width = bucket_max. into_inner ( ) - bucket_min. into_inner ( ) ;
367+ if bucket_width <= 0.0 {
368+ return 1.0 ;
369+ }
370+ let overlap_width = new_max. into_inner ( ) - new_min. into_inner ( ) ;
371+ ( overlap_width / bucket_width) . clamp ( 0.0 , 1.0 )
372+ }
373+ ( Datum :: Bytes ( _) , Datum :: Bytes ( _) , Datum :: Bytes ( _) , Datum :: Bytes ( _) ) => 1.0 ,
374+ _ => 1.0 ,
375+ }
376+ }
377+
/// Ratio between the number of integers in `[new_min, new_max]` and the number of
/// integers in `[bucket_min, bucket_max]`, clamped to `[0.0, 1.0]`.
///
/// Both ranges are inclusive. An empty or inverted bucket range yields `1.0`; an
/// empty or inverted new range yields `0.0` through the clamp.
fn discrete_overlap_selectivity(
    bucket_min: i128,
    bucket_max: i128,
    new_min: i128,
    new_max: i128,
) -> f64 {
    // Inclusive bounds: `[lo, hi]` holds `hi - lo + 1` integers.
    let inclusive_len = |lo: i128, hi: i128| hi - lo + 1;

    let bucket_len = inclusive_len(bucket_min, bucket_max);
    if bucket_len < 1 {
        return 1.0;
    }

    let fraction = inclusive_len(new_min, new_max) as f64 / bucket_len as f64;
    fraction.clamp(0.0, 1.0)
}
391+
204392pub enum HistogramBucketIter < ' a > {
205393 Int ( std:: slice:: Iter < ' a , TypedHistogramBucket < i64 > > ) ,
206394 UInt ( std:: slice:: Iter < ' a , TypedHistogramBucket < u64 > > ) ,
@@ -393,3 +581,58 @@ impl fmt::Display for Histogram {
393581 Ok ( ( ) )
394582 }
395583}
584+
#[cfg(test)]
mod tests {
    use super::*;

    /// Restricting buckets `[0, 4]` and `[5, 9]` to `[2, 6]` keeps both buckets,
    /// clips their bounds and scales their counts.
    #[test]
    fn test_restrict_to_bounds_uses_existing_buckets() -> ExceptionResult<()> {
        let source_buckets = vec![
            TypedHistogramBucket::new(0, 4, 5.0, 5.0),
            TypedHistogramBucket::new(5, 9, 5.0, 5.0),
        ];
        let histogram = Histogram::UInt(TypedHistogram {
            accuracy: true,
            buckets: source_buckets,
            avg_spacing: None,
        });

        let range_min = Datum::UInt(2);
        let range_max = Datum::UInt(6);
        let restricted = histogram
            .restrict_to_bounds(&range_min, &range_max)?
            .expect("histogram should keep intersecting buckets");
        let buckets = restricted.bucket_iter().collect::<Vec<_>>();

        // (lower bound, upper bound, num_values, num_distinct) per surviving bucket.
        let expected = vec![
            (Datum::UInt(2), Datum::UInt(4), 3.0, 3.0),
            (Datum::UInt(5), Datum::UInt(6), 2.0, 2.0),
        ];

        assert_eq!(buckets.len(), 2);
        for (bucket, (lower, upper, values, distinct)) in buckets.iter().zip(expected) {
            assert_eq!(bucket.lower_bound(), lower);
            assert_eq!(bucket.upper_bound(), upper);
            assert_eq!(bucket.num_values(), values);
            assert_eq!(bucket.num_distinct(), distinct);
        }
        Ok(())
    }

    /// An `Int` histogram joined with a `UInt` histogram still produces an estimate.
    #[test]
    fn test_mixed_numeric_join_uses_float_view_of_existing_buckets() -> ExceptionResult<()> {
        let signed_side = Histogram::Int(TypedHistogram {
            accuracy: true,
            buckets: vec![TypedHistogramBucket::new(1, 1, 3.0, 1.0)],
            avg_spacing: None,
        });
        let unsigned_side = Histogram::UInt(TypedHistogram {
            accuracy: true,
            buckets: vec![TypedHistogramBucket::new(1, 1, 2.0, 1.0)],
            avg_spacing: None,
        });

        let maybe_estimation = signed_side.estimate_join_numeric_compatible(&unsigned_side)?;
        let estimation =
            maybe_estimation.expect("mixed numeric histograms should use a float view");

        assert_eq!(estimation.cardinality.expected, 6.0);
        assert_eq!(estimation.ndv.expected, 1.0);
        Ok(())
    }
}
0 commit comments