Skip to content

Commit 13604de

Browse files
authored
fix: extract partition bits from low 32 bits on SSE4.2 for hash join spill (#19753)
1 parent 9865cc2 commit 13604de

2 files changed

Lines changed: 85 additions & 0 deletions

File tree

src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,15 @@ impl HashJoinSpiller {
359359
}
360360

361361
/// Maps a row hash to a spill partition id by taking the top `bits` bits of
/// the hash's usable range (SSE4.2 build: the low 32 bits of `hash`).
///
/// `bits` is the number of partition bits, so the result is always in
/// `0..(1 << bits)`. With `bits == 0` there is a single partition and the
/// result is 0. Values of `bits` above 32 are a caller bug: they trip the
/// debug assertion, and in release builds are clamped to 32 instead of
/// underflowing the shift amount.
#[inline(always)]
#[cfg(target_feature = "sse4.2")]
fn get_partition_id(hash: u64, bits: u64) -> u64 {
    // On x86 SSE4.2, _mm_crc32_u64 only sets the low 32 bits; high 32 bits are always 0.
    // Extract partition bits from the low 32 bits to avoid all rows landing in partition 0.
    const HASH_BITS: u64 = 32;
    debug_assert!(
        bits <= HASH_BITS,
        "partition bits ({}) exceed the {}-bit hash range",
        bits,
        HASH_BITS
    );
    // Clamp so `HASH_BITS - bits` cannot underflow when overflow checks are off.
    let bits = bits.min(HASH_BITS);
    // `bits <= 32` here, so `1 << bits` cannot overflow a u64.
    (hash >> (HASH_BITS - bits)) & ((1u64 << bits) - 1)
}

/// Maps a row hash to a spill partition id by taking the top `bits` bits of
/// the full 64-bit hash (build without SSE4.2).
///
/// `bits` is the number of partition bits, so the result is always in
/// `0..(1 << bits)` (the whole hash for `bits == 64`). With `bits == 0` there
/// is a single partition and the result is 0. Values of `bits` above 64 are a
/// caller bug: they trip the debug assertion, and in release builds are
/// clamped to 64.
#[inline(always)]
#[cfg(not(target_feature = "sse4.2"))]
fn get_partition_id(hash: u64, bits: u64) -> u64 {
    const HASH_BITS: u64 = 64;
    debug_assert!(
        bits <= HASH_BITS,
        "partition bits ({}) exceed the {}-bit hash range",
        bits,
        HASH_BITS
    );
    let bits = bits.min(HASH_BITS);
    if bits == 0 {
        // Shifting a u64 by 64 is an overflow (panics in debug builds), so the
        // single-partition case is answered directly.
        return 0;
    }
    // For 1..=64 bits the shift already leaves at most `bits` significant bits,
    // so no mask is needed; this also avoids `1 << 64` overflowing when bits == 64.
    hash >> (HASH_BITS - bits)
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Regression test: CRC32 high-bits-zero bug in hash join spill partitioning.
2+
# On x86 SSE4.2, _mm_crc32_u64 high 32 bits are always 0.
3+
# get_partition_id() must extract from low 32 bits on SSE4.2,
4+
# otherwise all rows land in partition 0 and spilling is ineffective.
5+
6+
# Session setup. Presumably force_join_data_spill makes every join below take
# the spill path and disable_join_reorder keeps the join sides as written --
# inferred from the setting names; confirm against the settings documentation.
statement ok
7+
set force_join_data_spill = 1;
8+
9+
statement ok
10+
set disable_join_reorder = 1;
11+
12+
# Every case below builds a 1000-row single-column table, self-joins it on that
# column, and expects count(*) = 1000.
# INT join key (KeysU32, 4 bytes)
13+
statement ok
14+
create or replace table t_int as select number::int as id from numbers(1000);
15+
16+
query I
17+
select count(*) from t_int t1 join t_int t2 on t1.id = t2.id;
18+
----
19+
1000
20+
21+
# BIGINT join key (KeysU64, 8 bytes)
22+
statement ok
23+
create or replace table t_bigint as select number::bigint as id from numbers(1000);
24+
25+
query I
26+
select count(*) from t_bigint t1 join t_bigint t2 on t1.id = t2.id;
27+
----
28+
1000
29+
30+
# DATE join key (KeysU32, 4 bytes)
31+
statement ok
32+
create or replace table t_date as select to_date(number) as id from numbers(1000);
33+
34+
query I
35+
select count(*) from t_date t1 join t_date t2 on t1.id = t2.id;
36+
----
37+
1000
38+
39+
# TIMESTAMP join key (KeysU64, 8 bytes)
40+
statement ok
41+
create or replace table t_ts as select to_timestamp(number) as id from numbers(1000);
42+
43+
query I
44+
select count(*) from t_ts t1 join t_ts t2 on t1.id = t2.id;
45+
----
46+
1000
47+
48+
# VARCHAR join key (Serializer)
49+
statement ok
50+
create or replace table t_str as select number::varchar as id from numbers(1000);
51+
52+
query I
53+
select count(*) from t_str t1 join t_str t2 on t1.id = t2.id;
54+
----
55+
1000
56+
57+
# Teardown: unset both session settings changed at the top, then drop the five
# tables created by this test.
statement ok
58+
unset force_join_data_spill;
59+
60+
statement ok
61+
unset disable_join_reorder;
62+
63+
statement ok
64+
drop table if exists t_int;
65+
66+
statement ok
67+
drop table if exists t_bigint;
68+
69+
statement ok
70+
drop table if exists t_date;
71+
72+
statement ok
73+
drop table if exists t_ts;
74+
75+
statement ok
76+
drop table if exists t_str;

0 commit comments

Comments
 (0)