From 86e2448067ae1b8b8c29555c3a07ffcb0e56d338 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Tue, 26 May 2026 13:58:10 +0000 Subject: [PATCH 1/5] Fix _vstore_unroll! for nested W=1 (scalar lane) VecUnroll LoopVectorization can produce a `VecUnroll{NO,1,T,VecUnroll{NI,1,T,T}}` when a `@turbo` loop has W=1 (e.g. a static length-1 inner dimension on ARM, where the SIMD register holds fewer Float64 lanes) combined with double unrolling. The innermost element type is the scalar `T` rather than `Vec{1,T}` because the `VecUnroll` constructor unwraps width-1 vectors. The existing generated `_vstore_unroll!` methods for nested unrolls all require `<:Vec{W,T}` as the innermost type, so this case hit a `MethodError`. This adds a method that handles `VecUnroll{NO,1,T,VecUnroll{NI,1,T,T}}` with a nested `Unroll{...,1,...,<:Unroll{...,1,...}}` by forwarding to the existing single-unroll handler at each outer index, which already supports the W=1 scalar case. Fixes LoopVectorization.jl issue #543 on Apple ARM (M-series) for v=1 nested static-dimension matmul-style loops at various inner sizes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/vecunroll/memory.jl | 115 +++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 24 deletions(-) diff --git a/src/vecunroll/memory.jl b/src/vecunroll/memory.jl index 630f6537..44b7656a 100644 --- a/src/vecunroll/memory.jl +++ b/src/vecunroll/memory.jl @@ -23,7 +23,7 @@ function unrolled_indicies( end inds = Vector{Expr}(undef, N) inds[1] = baseind - for n = 1:N-1 + for n = 1:(N-1) ind = copy(baseind) i = Expr(:call, Expr(:curly, :StaticInt, n * F)) if AU == AV && W > 1 @@ -180,7 +180,7 @@ function _shuffle_load_quote( return nothing if X > 0 mask_expr = :(mask(StaticInt{$W}(), 0, vmul_nw($UN, getfield(sm, :evl)))) - for n ∈ 1:UN-1 + for n ∈ 1:(UN-1) mask_expr = :(vcat( $mask_expr, mask(StaticInt{$W}(), $(n * W), vmul_nw($UN, getfield(sm, :evl))) @@ -197,7 +197,7 @@ function _shuffle_load_quote( Val{-1}() )) mask_expr = :(($vrange + $(UN * W)) ≤ vmul_nw($UN, getfield(sm, :evl))) - for n ∈ UN-1:-1:1 + for n ∈ (UN-1):-1:1 mask_expr = :(vcat( $mask_expr, ($vrange + $(n * W)) ≤ vmul_nw($UN, getfield(sm, :evl)) @@ -208,8 +208,8 @@ function _shuffle_load_quote( end push!(q.args, :(v = $vloadexpr)) vut = Expr(:tuple) - Wrange = X > 0 ? (0:1:W-1) : (W-1:-1:0) - for n ∈ 0:UN-1 + Wrange = X > 0 ? (0:1:(W-1)) : ((W-1):-1:0) + for n ∈ 0:(UN-1) shufftup = Expr(:tuple) for w ∈ Wrange push!(shufftup.args, n + UN * w) @@ -1001,8 +1001,8 @@ function _shuffle_store_quote( Wtemp = Wnext end shufftup = Expr(:tuple) - for w ∈ ((X > 0) ? (0:1:W-1) : (W-1:-1:0)) - for n ∈ 0:UN-1 + for w ∈ ((X > 0) ? 
(0:1:(W-1)) : ((W-1):-1:0)) + for n ∈ 0:(UN-1) push!(shufftup.args, W * n + w) end end @@ -1117,7 +1117,7 @@ function vstore_transpose_quote( for nn ∈ 1:npartial push!(t.args, vds[i+nn]) end - for nn ∈ npartial+1:n + for nn ∈ (npartial+1):n # if W == 1 # push!(t.args, :(zero($Tsym))) # else @@ -2252,7 +2252,7 @@ function vload_double_unroll_quote( unroll = :(Unroll{$AUO,$FO,$NO,$AV,$W,$MO,$X}(Zero())) # tupvec = Vector{Expr}(undef, NI) vds = Vector{Symbol}(undef, NI) - for ui ∈ 0:NI-1 + for ui ∈ 0:(NI-1) if ui == 0 loadq = :(_vload_unroll(gptr, $unroll)) # VecUnroll($tup) else @@ -2286,7 +2286,7 @@ function vload_double_unroll_quote( else # we loop over `UO+1` and do the loads unroll = :(Unroll{$AUI,$FI,$NI,$AV,$W,$MI,$X}(Zero())) tup = Expr(:tuple) - for uo ∈ 0:NO-1 + for uo ∈ 0:(NO-1) if uo == 0 loadq = :(_vload_unroll(gptr, $unroll)) else @@ -2473,7 +2473,7 @@ function vstore_double_unroll_quote( push!(q.args, :($vdt = getfield(getfield(vd, $t, false), 1))) end # tupvec = Vector{Expr}(undef, NI) - for ui ∈ 0:NI-1 + for ui ∈ 0:(NI-1) tup = Expr(:tuple) # tup = ui == 0 ? Expr(:tuple) : tupvec[ui+1] for t ∈ 1:NO @@ -2501,7 +2501,7 @@ function vstore_double_unroll_quote( end else # we loop over `UO+1` and do the stores unroll = :(Unroll{$AUI,$FI,$NI,$AV,$W,$MI,$X}(Zero())) - for uo ∈ 0:NO-1 + for uo ∈ 0:(NO-1) if uo == 0 storeq = :(_vstore_unroll!(gptr, getfield(vd, 1, false), $unroll)) else @@ -2753,6 +2753,73 @@ end ) end +# Doubly-unrolled scalar (W=1) case. The inner `VecUnroll` holds raw scalars +# rather than `Vec{1,T}` because `VecUnroll` unwraps width-1 vectors at +# construction. The generated methods above all match +# `VecUnroll{<:Any,W,T,<:VecUnroll{<:Any,W,T,Vec{W,T}}}`, so the W=1 nested +# scalar case falls through. Forward to the existing single-unroll handler at +# each outer index. 
+@generated function _vstore_unroll!( + sptr::AbstractStridedPointer{T,D,C}, + v::VecUnroll{NO_m1,1,T,<:VecUnroll{NI_m1,1,T,T}}, + u::Unroll{AUO,FO,NO,AV,1,MO,X,<:Unroll{AUI,FI,NI,AV,1,MI,X}}, + ::A, + ::S, + ::NT, + ::StaticInt{RS}, + ::SVUS +) where { + T, + D, + C, + NO_m1, + NI_m1, + AUO, + FO, + NO, + AUI, + FI, + NI, + AV, + MO, + MI, + X, + A<:StaticBool, + S<:StaticBool, + NT<:StaticBool, + RS, + SVUS +} + q = Expr( + :block, + Expr(:meta, :inline), + :(vd = getfield(v, :data)), + :(id = getfield(getfield(u, :i), :i)), + :(gptr = similar_no_offset(sptr, gep(pointer(sptr), id))) + ) + aexpr = Expr(:call, A === True ? :True : :False) + sexpr = Expr(:call, S === True ? :True : :False) + ntexpr = Expr(:call, NT === True ? :True : :False) + rsexpr = Expr(:call, Expr(:curly, :StaticInt, RS)) + svusexpr = SVUS <: StaticInt ? :($(SVUS())) : :nothing + inner_unroll = :(Unroll{$AUI,$FI,$NI,$AV,1,$MI,$X}(Zero())) + for uo = 0:(NO-1) + if uo == 0 + storeq = :(_vstore_unroll!(gptr, getfield(vd, 1, false), $inner_unroll)) + else + inds = sparse_index_tuple(D, AUO, uo * FO) + storeq = :(_vstore_unroll!( + gesp(gptr, $inds), + getfield(vd, $(uo + 1), false), + $inner_unroll + )) + end + push!(storeq.args, aexpr, sexpr, ntexpr, rsexpr, svusexpr) + push!(q.args, storeq) + end + q +end + function vstore_unroll_i_quote(Nm1, Wsplit, W, A, S, NT, rs::Int, mask::Bool) N = Nm1 + 1 N * Wsplit == W || throw( @@ -2993,10 +3060,10 @@ function transposeshuffle(split, W, offset::Bool) S = 1 << split i = offset ? 
S : 0 while w < W - for s ∈ 0:S-1 + for s ∈ 0:(S-1) push!(tup.args, w + s + i) end - for s ∈ 0:S-1 + for s ∈ 0:(S-1) # push!(tup.args, w + W + s) push!(tup.args, w + W + s + i) end @@ -3030,7 +3097,7 @@ function horizontal_reduce_store_expr( push!(q.args, :(gptr = gesp(ptr, $gf(u, :i)))) push!(q.args, :(bptr = pointer(gptr))) extractblock = Expr(:block) - vectors = [Symbol(:v_, n) for n ∈ 0:N-1] + vectors = [Symbol(:v_, n) for n ∈ 0:(N-1)] for n ∈ 1:N push!( extractblock.args, @@ -3090,7 +3157,7 @@ function horizontal_reduce_store_expr( v0, Expr( :call, - Expr(:curly, :Val, Expr(:tuple, [w for w ∈ 0:Wh-1]...)) + Expr(:curly, :Val, Expr(:tuple, [w for w ∈ 0:(Wh-1)]...)) ) ), Expr( @@ -3099,7 +3166,7 @@ function horizontal_reduce_store_expr( v0, Expr( :call, - Expr(:curly, :Val, Expr(:tuple, [w for w ∈ Wh:Wt-1]...)) + Expr(:curly, :Val, Expr(:tuple, [w for w ∈ Wh:(Wt-1)]...)) ) ) ) @@ -3120,7 +3187,7 @@ function horizontal_reduce_store_expr( end if mask boolmask = Expr(:call, :Vec) - for n ∈ ncomp+1:ncomp+minWN + for n ∈ (ncomp+1):(ncomp+minWN) push!(boolmask.args, Expr(:call, gf, :masktuple, n, false)) end push!(storeexpr.args, Expr(:call, :tomask, boolmask)) @@ -3138,7 +3205,7 @@ function horizontal_reduce_store_expr( zeroexpr = Expr(:call, Expr(:curly, :StaticInt, 0)) ind = Expr(:tuple) foreach(_ -> push!(ind.args, zeroexpr), 1:D) - for n ∈ N+1:Ntotal + for n ∈ (N+1):Ntotal (n > N + 1) && (ind = copy(ind)) # copy to avoid overwriting old ind.args[AU] = Expr(:call, Expr(:curly, :StaticInt, F * (n - 1))) scalar = Expr(:call, reduct, Expr(:call, gf, :v, n, false)) @@ -3346,7 +3413,7 @@ function lazymulunroll_load_quote(M, O, N, maskall, masklast, align, rs) alignval = Expr(:call, align ? 
:True : :False) rsexpr = Expr(:call, Expr(:curly, :StaticInt, rs)) gf = GlobalRef(Core, :getfield) - for n = 1:N+1 + for n = 1:(N+1) ind = if (M != 1) | (O != 0) :(LazyMulAdd{$M,$O}(u[$n])) else @@ -3489,7 +3556,7 @@ function lazymulunroll_store_quote( noaliasval = Expr(:call, noalias ? :True : :False) nontemporalval = Expr(:call, nontemporal ? :True : :False) rsexpr = Expr(:call, Expr(:curly, :StaticInt, rs)) - for n = 1:N+1 + for n = 1:(N+1) push!( q.args, Expr( @@ -3520,7 +3587,7 @@ end v = Base.FastMath.add_fast(s + mm) end t = Expr(:tuple, :v) - for n ∈ 1:N-1 + for n ∈ 1:(N-1) # push!(t.args, :(MM{$W,$W}(Base.FastMath.add_fast(s, $(T(n*W)))))) push!( t.args, @@ -3548,7 +3615,7 @@ end else Expr(:tuple, :v) end - for n ∈ 1:N-1 + for n ∈ 1:(N-1) M >>>= 1 if M % Bool push!( @@ -3583,7 +3650,7 @@ end z = zero(v) end t = Expr(:tuple, :(ifelse(getfield(m, $1, false), v, z))) - for n ∈ 1:N-1 + for n ∈ 1:(N-1) push!( t.args, :(ifelse( From 598ba2daf03e7f33319dfbcaef07de2269381b23 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Tue, 26 May 2026 14:57:03 +0000 Subject: [PATCH 2/5] Fix BitVector dynamic-index load misalignment (Apple ARM Bernoulli_logit) `vload_quote_llvmcall_core` emits a `<W x i1>` load whose pointer is computed as `ptr + (index >> 3)` for the dynamic-index BitArray case (see `offset_ptr` at memory_addr.jl:308: `ashr i$ibits %indargname, 3`). That only reads the correct W bits when `index & 7 == 0`. For any other runtime index (e.g. the cleanup unroll loops of LV that step by `W * UN < 8` elements), the load reads bits 0..W-1 of the addressed byte, which are the wrong bits. This happens on every architecture, but the bug only manifests as wrong test results on Apple ARM (M-series) because NEON's natural vector width for Float64 is 2, so the SIMD-cleanup tail of the `Bernoulli_logitavx` loop in LV's `test/ifelsemasks.jl` hits non-byte-aligned bit indices for most random seeds. 
On x86 with AVX2 (W=4) or AVX-512 (W=8), the lane alignment happens to avoid the problem for the test inputs in question. The fix issues a wider integer load that covers W bits starting at any bit offset 0..7, shifts right by `index & 7`, then truncates back to `<W x i1>` so the downstream code is unchanged. It is only enabled on the dynamic-index Integer-index, non-mask, non-grv, non-reverse, W>1 BitArray path. Together with the nested W=1 `_vstore_unroll!` method, this unblocks the BitVector + ternary tests in LoopVectorization.jl's `ifelsemasks.jl` (`Bernoulli_logitavx` / `Bernoulli_logit_avx` with `BitVector` mask). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/llvm_intrin/memory_addr.jl | 78 +++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 10 deletions(-) diff --git a/src/llvm_intrin/memory_addr.jl b/src/llvm_intrin/memory_addr.jl index 8042e9f4..257be2dd 100644 --- a/src/llvm_intrin/memory_addr.jl +++ b/src/llvm_intrin/memory_addr.jl @@ -907,6 +907,20 @@ function vload_quote_llvmcall_core( decl = LOAD_SCOPE_TBAA dynamic_index = !(iszero(M) || ind_type === :StaticInt) + # Detect the dynamic-index bit-load path that uses `ashr index, 3` for byte + # addressing in `offset_ptr`. In that path the loaded `<W x i1>` only reads + # bits 0..W-1 of the addressed byte, which is wrong whenever the original + # index is not a multiple of 8 (e.g. in cleanup unroll loops of LV that step + # by W*UN < 8 elements). To handle misalignment, perform a wider integer + # load and shift right by `index & 7` before truncating to `i$W`. + bit_dyn_misalign_fix = + isbit && + dynamic_index && + (ind_type === :Integer) && + !grv && + !mask && + !reverse_load && + W > 1 vtyp = vtype(W, typ) if mask if reverse_load @@ -964,18 +978,62 @@ function vload_quote_llvmcall_core( ) end else - @static if USE_OPAQUE_PTR - push!( + if bit_dyn_misalign_fix + # Wide integer load that covers W bits starting at any bit offset 0..7. 
+ # Need W+7 bits; round up to a power-of-2 byte width LLVM handles well. + wide_bits = max(8, nextpow2(W + 7)) + wide_typ = "i$(wide_bits)" + @static if USE_OPAQUE_PTR + push!( + instrs, + "%bitrawres = load $wide_typ, ptr %ptr.$(i-1), align 1" * + LOAD_SCOPE_TBAA_FLAGS + ) + else + push!( + instrs, + "%bitrawres = load $wide_typ, $wide_typ* %ptr.$(i-1), align 1" * + LOAD_SCOPE_TBAA_FLAGS + ) + end + # `%1` is the original (dynamic) index in `i$(ibits)`; compute `index & 7` + # and zero-extend/truncate to `wide_typ` to use as a shift amount. + push!(instrs, "%bitoff.raw = and i$(ibits) %1, 7") + if ibits < wide_bits + push!( + instrs, + "%bitoff = zext i$(ibits) %bitoff.raw to $wide_typ" + ) + elseif ibits > wide_bits + push!( + instrs, + "%bitoff = trunc i$(ibits) %bitoff.raw to $wide_typ" + ) + else + push!(instrs, "%bitoff = bitcast i$(ibits) %bitoff.raw to $wide_typ") + end + push!(instrs, "%bitshifted = lshr $wide_typ %bitrawres, %bitoff") + # Produce `<W x i1>` to keep downstream code identical. 
+ if wide_bits > W + push!(instrs, "%bittrunc = trunc $wide_typ %bitshifted to i$(W)") + push!(instrs, "%res = bitcast i$(W) %bittrunc to <$W x i1>") + else + push!(instrs, "%res = bitcast $wide_typ %bitshifted to <$W x i1>") + end + else + @static if USE_OPAQUE_PTR + push!( + instrs, + "%res = load $vtyp, ptr %ptr.$(i-1), align $alignment" * + LOAD_SCOPE_TBAA_FLAGS + ) + else + push!( instrs, - "%res = load $vtyp, ptr %ptr.$(i-1), align $alignment" * + "%res = load $vtyp, $vtyp* %ptr.$(i-1), align $alignment" * LOAD_SCOPE_TBAA_FLAGS - ) - else - push!( - instrs, - "%res = load $vtyp, $vtyp* %ptr.$(i-1), align $alignment" * - LOAD_SCOPE_TBAA_FLAGS - ) + ) + end end end if isbit From 33d18a1094b59b02e2a6834a9555b2d362223477 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Tue, 26 May 2026 21:25:49 +0000 Subject: [PATCH 3/5] Bitcast pointer in BitVector dynamic-index load for non-opaque LLVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix emitted `load i$wide, $wide_typ* %ptr.X` without bitcasting the `%ptr.X` value, which `offset_ptr` produces typed as `<W x i1>*`. Under Julia ≤ 1.10 (LLVM without opaque pointers) this fails with `'%ptr.X' defined with type '<W x i1>*' but expected 'iN*'`, seen on the downstream LoopVectorization.jl LTS interface tests. Insert a `bitcast <W x i1>* to $wide_typ*` so the wide integer load typechecks. No effect on the opaque-pointer path used by Julia 1.11+. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/llvm_intrin/memory_addr.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/llvm_intrin/memory_addr.jl b/src/llvm_intrin/memory_addr.jl index 257be2dd..6c6a58f7 100644 --- a/src/llvm_intrin/memory_addr.jl +++ b/src/llvm_intrin/memory_addr.jl @@ -990,9 +990,16 @@ function vload_quote_llvmcall_core( LOAD_SCOPE_TBAA_FLAGS ) else + # `%ptr.$(i-1)` was typed as `<W x i1>*` (or similar) by `offset_ptr`; + # bitcast to `wide_typ*` before issuing the wide integer load so the + # non-opaque-pointer LLVM IR (Julia ≤ 1.10) typechecks. push!( instrs, - "%bitrawres = load $wide_typ, $wide_typ* %ptr.$(i-1), align 1" * + "%ptr.bit$(i-1) = bitcast $vtyp* %ptr.$(i-1) to $wide_typ*" + ) + push!( + instrs, + "%bitrawres = load $wide_typ, $wide_typ* %ptr.bit$(i-1), align 1" * LOAD_SCOPE_TBAA_FLAGS ) end From 7977c5bcc275a539dc1c306c26836ab7d0fe95f6 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Wed, 27 May 2026 14:55:42 +0000 Subject: [PATCH 4/5] Rerun CI on top of merged #128 + bumped downstream releases From 283fe18a52573d21d1f525d485b9699b367a2a45 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Fri, 29 May 2026 04:40:39 +0000 Subject: [PATCH 5/5] Rerun CI on top of SLEEFPirates v0.6.46+