From 990235eee41d7f953d82a1a793be5b78799e0401 Mon Sep 17 00:00:00 2001 From: Tonu Samuel Date: Wed, 24 Jun 2026 15:19:49 +0300 Subject: [PATCH] MDEV-40145 Vectorize VEC_DISTANCE_EUCLIDEAN/COSINE brute-force distance calc_distance_euclidean()/calc_distance_cosine() (used by VEC_DISTANCE_*() for non-indexed, brute-force search) accumulated into a single double. With strict floating-point order (no -ffast-math) the compiler must preserve the summation order, so the reduction is a serial dependency chain it cannot vectorize -- even though it already vectorizes the surrounding subtract/widen/square. Use several independent accumulators so the compiler emits a vectorized reduction on every architecture, with no intrinsics. The summation order changes, so results differ from the previous code only in the last ~1 ULP; neighbour rankings are unaffected and all main.vector* tests pass unchanged. Measured on a real server build (Apple M4, 20000 x 512-dim, no index): VEC_DISTANCE_EUCLIDEAN 7.37ms -> 3.98ms (1.85x) VEC_DISTANCE_COSINE 9.05ms -> 6.94ms (1.30x) Standalone kernel A/B confirms cross-platform: ~2.0x AVX2 (Zen), ~1.8x AVX-512 (Xeon). Correctness checked across all dimensions 1..2050 (max relative error 1.3e-14 euclidean, 9e-9 cosine). --- sql/item_vectorfunc.cc | 69 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/sql/item_vectorfunc.cc b/sql/item_vectorfunc.cc index 354405c3b5fe2..8936905c7f972 100644 --- a/sql/item_vectorfunc.cc +++ b/sql/item_vectorfunc.cc @@ -24,12 +24,42 @@ #include "vector_mhnsw.h" #include "sql_type_vector.h" +/* + Distance functions use several independent accumulators instead of one. + The MariaDB build keeps strict floating-point order (no -ffast-math), so a + single `sum+= ...` accumulator is a serial dependency chain the compiler may + not reorder or vectorize. Independent partial sums break that chain and let + the compiler emit a vectorized reduction on every architecture, with no + intrinsics. The summation order changes, so results differ from the previous + code only in the last ~1 ULP; rankings are unaffected. +*/ static double calc_distance_euclidean(float *v1, float *v2, size_t v_len) { - double d= 0; - for (size_t i= 0; i < v_len; i++, v1++, v2++) + double a0= 0, a1= 0, a2= 0, a3= 0, a4= 0, a5= 0, a6= 0, a7= 0; + size_t i= 0; + for (; i + 8 <= v_len; i+= 8) + { + double d0= get_float(v1 + i) - get_float(v2 + i); + double d1= get_float(v1 + i + 1) - get_float(v2 + i + 1); + double d2= get_float(v1 + i + 2) - get_float(v2 + i + 2); + double d3= get_float(v1 + i + 3) - get_float(v2 + i + 3); + double d4= get_float(v1 + i + 4) - get_float(v2 + i + 4); + double d5= get_float(v1 + i + 5) - get_float(v2 + i + 5); + double d6= get_float(v1 + i + 6) - get_float(v2 + i + 6); + double d7= get_float(v1 + i + 7) - get_float(v2 + i + 7); + a0+= d0 * d0; + a1+= d1 * d1; + a2+= d2 * d2; + a3+= d3 * d3; + a4+= d4 * d4; + a5+= d5 * d5; + a6+= d6 * d6; + a7+= d7 * d7; + } + double d= ((a0 + a1) + (a2 + a3)) + ((a4 + a5) + (a6 + a7)); + for (; i < v_len; i++) { - double dist= get_float(v1) - get_float(v2); + double dist= get_float(v1 + i) - get_float(v2 + i); d+= dist * dist; } return sqrt(d); @@ -37,15 +67,40 @@ static double calc_distance_euclidean(float *v1, float *v2, size_t v_len) static double calc_distance_cosine(float *v1, float *v2, size_t v_len) { - double dotp=0, abs1=0, abs2=0; - for (size_t i= 0; i < v_len; i++, v1++, v2++) + double dp0= 0, dp1= 0, dp2= 0, dp3= 0; + double abs1_0= 0, abs1_1= 0, abs1_2= 0, abs1_3= 0; + double abs2_0= 0, abs2_1= 0, abs2_2= 0, abs2_3= 0; + size_t i= 0; + for (; i + 4 <= v_len; i+= 4) + { + float x0= get_float(v1 + i), x1= get_float(v1 + i + 1), + x2= get_float(v1 + i + 2), x3= get_float(v1 + i + 3); + float y0= get_float(v2 + i), y1= get_float(v2 + i + 1), + y2= get_float(v2 + i + 2), y3= get_float(v2 + i + 3); + dp0+= x0 * y0; + dp1+= x1 * y1; + dp2+= x2 * y2; + dp3+= x3 * y3; + abs1_0+= x0 * x0; + abs1_1+= x1 * x1; + abs1_2+= x2 * x2; + abs1_3+= x3 * x3; + abs2_0+= y0 * y0; + abs2_1+= y1 * y1; + abs2_2+= y2 * y2; + abs2_3+= y3 * y3; + } + double dotp= (dp0 + dp1) + (dp2 + dp3); + double abs1= (abs1_0 + abs1_1) + (abs1_2 + abs1_3); + double abs2= (abs2_0 + abs2_1) + (abs2_2 + abs2_3); + for (; i < v_len; i++) { - float f1= get_float(v1), f2= get_float(v2); + float f1= get_float(v1 + i), f2= get_float(v2 + i); abs1+= f1 * f1; abs2+= f2 * f2; dotp+= f1 * f2; } - return 1 - dotp/sqrt(abs1*abs2); + return 1 - dotp / sqrt(abs1 * abs2); } Item_func_vec_distance::Item_func_vec_distance(THD *thd, Item *a, Item *b,