From 86e2448067ae1b8b8c29555c3a07ffcb0e56d338 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas <accounts@chrisrackauckas.com>
Date: Tue, 26 May 2026 13:58:10 +0000
Subject: [PATCH 1/5] Fix _vstore_unroll! for nested W=1 (scalar lane)
 VecUnroll

LoopVectorization can produce a `VecUnroll{NO,1,T,VecUnroll{NI,1,T,T}}`
when a `@turbo` loop has W=1 (e.g. a static length-1 inner dimension on
ARM, where the SIMD register holds fewer Float64 lanes) combined with
double unrolling. The innermost element type is the scalar `T` rather
than `Vec{1,T}` because the `VecUnroll` constructor unwraps width-1
vectors. The existing generated `_vstore_unroll!` methods for nested
unrolls all require `<:Vec{W,T}` as the innermost type, so this case
hit a `MethodError`.

This adds a method that handles `VecUnroll{NO,1,T,VecUnroll{NI,1,T,T}}`
with a nested `Unroll{...,1,...,<:Unroll{...,1,...}}` by forwarding to
the existing single-unroll handler at each outer index, which already
supports the W=1 scalar case.

Fixes LoopVectorization.jl issue #543 on Apple ARM (M-series) for v=1
nested static-dimension matmul-style loops at various inner sizes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/vecunroll/memory.jl | 115 +++++++++++++++++++++++++++++++---------
 1 file changed, 91 insertions(+), 24 deletions(-)

diff --git a/src/vecunroll/memory.jl b/src/vecunroll/memory.jl
index 630f6537..44b7656a 100644
--- a/src/vecunroll/memory.jl
+++ b/src/vecunroll/memory.jl
@@ -23,7 +23,7 @@ function unrolled_indicies(
   end
   inds = Vector{Expr}(undef, N)
   inds[1] = baseind
-  for n = 1:N-1
+  for n = 1:(N-1)
     ind = copy(baseind)
     i = Expr(:call, Expr(:curly, :StaticInt, n * F))
     if AU == AV && W > 1
@@ -180,7 +180,7 @@ function _shuffle_load_quote(
     return nothing
     if X > 0
       mask_expr = :(mask(StaticInt{$W}(), 0, vmul_nw($UN, getfield(sm, :evl))))
-      for n ∈ 1:UN-1
+      for n ∈ 1:(UN-1)
         mask_expr = :(vcat(
           $mask_expr,
           mask(StaticInt{$W}(), $(n * W), vmul_nw($UN, getfield(sm, :evl)))
@@ -197,7 +197,7 @@ function _shuffle_load_quote(
         Val{-1}()
       ))
       mask_expr = :(($vrange + $(UN * W)) ≤ vmul_nw($UN, getfield(sm, :evl)))
-      for n ∈ UN-1:-1:1
+      for n ∈ (UN-1):-1:1
         mask_expr = :(vcat(
           $mask_expr,
           ($vrange + $(n * W)) ≤ vmul_nw($UN, getfield(sm, :evl))
@@ -208,8 +208,8 @@ function _shuffle_load_quote(
   end
   push!(q.args, :(v = $vloadexpr))
   vut = Expr(:tuple)
-  Wrange = X > 0 ? (0:1:W-1) : (W-1:-1:0)
-  for n ∈ 0:UN-1
+  Wrange = X > 0 ? (0:1:(W-1)) : ((W-1):-1:0)
+  for n ∈ 0:(UN-1)
     shufftup = Expr(:tuple)
     for w ∈ Wrange
       push!(shufftup.args, n + UN * w)
@@ -1001,8 +1001,8 @@ function _shuffle_store_quote(
     Wtemp = Wnext
   end
   shufftup = Expr(:tuple)
-  for w ∈ ((X > 0) ? (0:1:W-1) : (W-1:-1:0))
-    for n ∈ 0:UN-1
+  for w ∈ ((X > 0) ? (0:1:(W-1)) : ((W-1):-1:0))
+    for n ∈ 0:(UN-1)
       push!(shufftup.args, W * n + w)
     end
   end
@@ -1117,7 +1117,7 @@ function vstore_transpose_quote(
       for nn ∈ 1:npartial
         push!(t.args, vds[i+nn])
       end
-      for nn ∈ npartial+1:n
+      for nn ∈ (npartial+1):n
         # if W == 1
         #     push!(t.args, :(zero($Tsym)))
         # else
@@ -2252,7 +2252,7 @@ function vload_double_unroll_quote(
     unroll = :(Unroll{$AUO,$FO,$NO,$AV,$W,$MO,$X}(Zero()))
     # tupvec = Vector{Expr}(undef, NI)
     vds = Vector{Symbol}(undef, NI)
-    for ui ∈ 0:NI-1
+    for ui ∈ 0:(NI-1)
       if ui == 0
         loadq = :(_vload_unroll(gptr, $unroll)) # VecUnroll($tup)
       else
@@ -2286,7 +2286,7 @@ function vload_double_unroll_quote(
   else # we loop over `UO+1` and do the loads
     unroll = :(Unroll{$AUI,$FI,$NI,$AV,$W,$MI,$X}(Zero()))
     tup = Expr(:tuple)
-    for uo ∈ 0:NO-1
+    for uo ∈ 0:(NO-1)
       if uo == 0
         loadq = :(_vload_unroll(gptr, $unroll))
       else
@@ -2473,7 +2473,7 @@ function vstore_double_unroll_quote(
       push!(q.args, :($vdt = getfield(getfield(vd, $t, false), 1)))
     end
     # tupvec = Vector{Expr}(undef, NI)
-    for ui ∈ 0:NI-1
+    for ui ∈ 0:(NI-1)
       tup = Expr(:tuple)
       # tup = ui == 0 ? Expr(:tuple) : tupvec[ui+1]
       for t ∈ 1:NO
@@ -2501,7 +2501,7 @@ function vstore_double_unroll_quote(
     end
   else # we loop over `UO+1` and do the stores
     unroll = :(Unroll{$AUI,$FI,$NI,$AV,$W,$MI,$X}(Zero()))
-    for uo ∈ 0:NO-1
+    for uo ∈ 0:(NO-1)
       if uo == 0
         storeq = :(_vstore_unroll!(gptr, getfield(vd, 1, false), $unroll))
       else
@@ -2753,6 +2753,73 @@ end
   )
 end
 
+# Doubly-unrolled scalar (W=1) store. The inner `VecUnroll` holds raw scalars
+# rather than `Vec{1,T}` because `VecUnroll` unwraps width-1 vectors at
+# construction. The generated methods above all match
+# `VecUnroll{<:Any,W,T,<:VecUnroll{<:Any,W,T,Vec{W,T}}}`, so the W=1 nested
+# scalar case falls through. Forward to the existing single-unroll handler at
+# each outer index, emitting one `_vstore_unroll!` call per `uo ∈ 0:(NO-1)`.
+@generated function _vstore_unroll!(
+  sptr::AbstractStridedPointer{T,D,C},
+  v::VecUnroll{NO_m1,1,T,<:VecUnroll{NI_m1,1,T,T}}, # nested unroll of scalars
+  u::Unroll{AUO,FO,NO,AV,1,MO,X,<:Unroll{AUI,FI,NI,AV,1,MI,X}}, # outer wraps inner
+  ::A, # re-emitted below as `True()`/`False()`
+  ::S, # re-emitted below as `True()`/`False()`
+  ::NT, # re-emitted below as `True()`/`False()`
+  ::StaticInt{RS}, # re-emitted below as `StaticInt{RS}()`
+  ::SVUS # forwarded as an instance only if `SVUS <: StaticInt`, else `nothing`
+) where {
+  T,
+  D,
+  C,
+  NO_m1, # not referenced in the body; only constrains dispatch
+  NI_m1, # not referenced in the body; only constrains dispatch
+  AUO,
+  FO,
+  NO,
+  AUI,
+  FI,
+  NI,
+  AV,
+  MO,
+  MI,
+  X,
+  A<:StaticBool,
+  S<:StaticBool,
+  NT<:StaticBool,
+  RS,
+  SVUS
+}
+  q = Expr(
+    :block,
+    Expr(:meta, :inline), # mark the generated body for inlining
+    :(vd = getfield(v, :data)), # outer data; indexed once per outer step below
+    :(id = getfield(getfield(u, :i), :i)), # `u.i.i`: presumably the base index
+    :(gptr = similar_no_offset(sptr, gep(pointer(sptr), id))) # rebased at `id`
+  )
+  aexpr = Expr(:call, A === True ? :True : :False) # `True()` or `False()`
+  sexpr = Expr(:call, S === True ? :True : :False) # `True()` or `False()`
+  ntexpr = Expr(:call, NT === True ? :True : :False) # `True()` or `False()`
+  rsexpr = Expr(:call, Expr(:curly, :StaticInt, RS)) # `StaticInt{RS}()`
+  svusexpr = SVUS <: StaticInt ? :($(SVUS())) : :nothing # instance, or `nothing`
+  inner_unroll = :(Unroll{$AUI,$FI,$NI,$AV,1,$MI,$X}(Zero())) # `id` is in `gptr`
+  for uo = 0:(NO-1) # one inner (single-unroll) store per outer unroll step
+    if uo == 0
+      storeq = :(_vstore_unroll!(gptr, getfield(vd, 1, false), $inner_unroll))
+    else
+      inds = sparse_index_tuple(D, AUO, uo * FO) # presumably `uo * FO` on axis `AUO`
+      storeq = :(_vstore_unroll!(
+        gesp(gptr, $inds),
+        getfield(vd, $(uo + 1), false),
+        $inner_unroll
+      ))
+    end
+    push!(storeq.args, aexpr, sexpr, ntexpr, rsexpr, svusexpr) # trailing call args
+    push!(q.args, storeq)
+  end
+  q
+end
+
 function vstore_unroll_i_quote(Nm1, Wsplit, W, A, S, NT, rs::Int, mask::Bool)
   N = Nm1 + 1
   N * Wsplit == W || throw(
@@ -2993,10 +3060,10 @@ function transposeshuffle(split, W, offset::Bool)
   S = 1 << split
   i = offset ? S : 0
   while w < W
-    for s ∈ 0:S-1
+    for s ∈ 0:(S-1)
       push!(tup.args, w + s + i)
     end
-    for s ∈ 0:S-1
+    for s ∈ 0:(S-1)
       # push!(tup.args, w + W + s)
       push!(tup.args, w + W + s + i)
     end
@@ -3030,7 +3097,7 @@ function horizontal_reduce_store_expr(
     push!(q.args, :(gptr = gesp(ptr, $gf(u, :i))))
     push!(q.args, :(bptr = pointer(gptr)))
     extractblock = Expr(:block)
-    vectors = [Symbol(:v_, n) for n ∈ 0:N-1]
+    vectors = [Symbol(:v_, n) for n ∈ 0:(N-1)]
     for n ∈ 1:N
       push!(
         extractblock.args,
@@ -3090,7 +3157,7 @@ function horizontal_reduce_store_expr(
                 v0,
                 Expr(
                   :call,
-                  Expr(:curly, :Val, Expr(:tuple, [w for w ∈ 0:Wh-1]...))
+                  Expr(:curly, :Val, Expr(:tuple, [w for w ∈ 0:(Wh-1)]...))
                 )
               ),
               Expr(
@@ -3099,7 +3166,7 @@ function horizontal_reduce_store_expr(
                 v0,
                 Expr(
                   :call,
-                  Expr(:curly, :Val, Expr(:tuple, [w for w ∈ Wh:Wt-1]...))
+                  Expr(:curly, :Val, Expr(:tuple, [w for w ∈ Wh:(Wt-1)]...))
                 )
               )
             )
@@ -3120,7 +3187,7 @@ function horizontal_reduce_store_expr(
       end
       if mask
         boolmask = Expr(:call, :Vec)
-        for n ∈ ncomp+1:ncomp+minWN
+        for n ∈ (ncomp+1):(ncomp+minWN)
           push!(boolmask.args, Expr(:call, gf, :masktuple, n, false))
         end
         push!(storeexpr.args, Expr(:call, :tomask, boolmask))
@@ -3138,7 +3205,7 @@ function horizontal_reduce_store_expr(
     zeroexpr = Expr(:call, Expr(:curly, :StaticInt, 0))
     ind = Expr(:tuple)
     foreach(_ -> push!(ind.args, zeroexpr), 1:D)
-    for n ∈ N+1:Ntotal
+    for n ∈ (N+1):Ntotal
       (n > N + 1) && (ind = copy(ind)) # copy to avoid overwriting old
       ind.args[AU] = Expr(:call, Expr(:curly, :StaticInt, F * (n - 1)))
       scalar = Expr(:call, reduct, Expr(:call, gf, :v, n, false))
@@ -3346,7 +3413,7 @@ function lazymulunroll_load_quote(M, O, N, maskall, masklast, align, rs)
   alignval = Expr(:call, align ? :True : :False)
   rsexpr = Expr(:call, Expr(:curly, :StaticInt, rs))
   gf = GlobalRef(Core, :getfield)
-  for n = 1:N+1
+  for n = 1:(N+1)
     ind = if (M != 1) | (O != 0)
       :(LazyMulAdd{$M,$O}(u[$n]))
     else
@@ -3489,7 +3556,7 @@ function lazymulunroll_store_quote(
   noaliasval = Expr(:call, noalias ? :True : :False)
   nontemporalval = Expr(:call, nontemporal ? :True : :False)
   rsexpr = Expr(:call, Expr(:curly, :StaticInt, rs))
-  for n = 1:N+1
+  for n = 1:(N+1)
     push!(
       q.args,
       Expr(
@@ -3520,7 +3587,7 @@ end
     v = Base.FastMath.add_fast(s + mm)
   end
   t = Expr(:tuple, :v)
-  for n ∈ 1:N-1
+  for n ∈ 1:(N-1)
     # push!(t.args, :(MM{$W,$W}(Base.FastMath.add_fast(s, $(T(n*W))))))
     push!(
       t.args,
@@ -3548,7 +3615,7 @@ end
   else
     Expr(:tuple, :v)
   end
-  for n ∈ 1:N-1
+  for n ∈ 1:(N-1)
     M >>>= 1
     if M % Bool
       push!(
@@ -3583,7 +3650,7 @@ end
     z = zero(v)
   end
   t = Expr(:tuple, :(ifelse(getfield(m, $1, false), v, z)))
-  for n ∈ 1:N-1
+  for n ∈ 1:(N-1)
     push!(
       t.args,
       :(ifelse(

From 598ba2daf03e7f33319dfbcaef07de2269381b23 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas <accounts@chrisrackauckas.com>
Date: Tue, 26 May 2026 14:57:03 +0000
Subject: [PATCH 2/5] Fix BitVector dynamic-index load misalignment (Apple ARM
 Bernoulli_logit)

`vload_quote_llvmcall_core` emits a `<W x i1>` load whose pointer is
computed as `ptr + (index >> 3)` for the dynamic-index BitArray case
(see `offset_ptr` at memory_addr.jl:308: `ashr i$ibits %indargname, 3`).
That only reads the correct W bits when `index & 7 == 0`. For any other
runtime index (e.g. the cleanup unroll loops of LV that step by
`W * UN < 8` elements), the load reads bits 0..W-1 of the addressed
byte, which are the wrong bits.

This happens on every architecture, but the bug only manifests as
wrong test results on Apple ARM (M-series) because NEON's natural
vector width for Float64 is 2, so the SIMD-cleanup tail of the
`Bernoulli_logitavx` loop in LV's `test/ifelsemasks.jl` hits
non-byte-aligned bit indices for most random seeds. On x86 with AVX2
(W=4) or AVX-512 (W=8), the lane alignment happens to avoid the
problem for the test inputs in question.

The fix issues a wider integer load that covers W bits starting at
any bit offset 0..7, shifts right by `index & 7`, then truncates back
to `<W x i1>` so the downstream code is unchanged. It is only enabled
on the BitArray path when the index is a dynamic `Integer` and the load
is unmasked, non-`grv`, non-reversed, and has W > 1.

Together with the nested W=1 `_vstore_unroll!` method, this unblocks
the BitVector + ternary tests in LoopVectorization.jl's `ifelsemasks.jl`
(`Bernoulli_logitavx` / `Bernoulli_logit_avx` with `BitVector` mask).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/llvm_intrin/memory_addr.jl | 78 +++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 10 deletions(-)

diff --git a/src/llvm_intrin/memory_addr.jl b/src/llvm_intrin/memory_addr.jl
index 8042e9f4..257be2dd 100644
--- a/src/llvm_intrin/memory_addr.jl
+++ b/src/llvm_intrin/memory_addr.jl
@@ -907,6 +907,20 @@ function vload_quote_llvmcall_core(
   decl = LOAD_SCOPE_TBAA
   dynamic_index = !(iszero(M) || ind_type === :StaticInt)
 
+  # Detect the dynamic-index bit-load path that uses `ashr index, 3` for byte
+  # addressing in `offset_ptr`. In that path the loaded `<W x i1>` only reads
+  # bits 0..W-1 of the addressed byte, which is wrong whenever the original
+  # index is not a multiple of 8 (e.g. in cleanup unroll loops of LV that step
+  # by W*UN < 8 elements). To handle misalignment, perform a wider integer
+  # load and shift right by `index & 7` before truncating to `i$W`.
+  bit_dyn_misalign_fix =
+    isbit &&
+    dynamic_index &&
+    (ind_type === :Integer) &&
+    !grv &&
+    !mask &&
+    !reverse_load &&
+    W > 1
   vtyp = vtype(W, typ)
   if mask
     if reverse_load
@@ -964,18 +978,62 @@ function vload_quote_llvmcall_core(
       )
     end
   else
-    @static if USE_OPAQUE_PTR
-      push!(
+    if bit_dyn_misalign_fix
+      # Wide integer load that covers W bits starting at any bit offset 0..7.
+      # Need W+7 bits; round up to a power-of-2 bit width (>= 16, as W > 1).
+      wide_bits = max(8, nextpow2(W + 7))
+      wide_typ = "i$(wide_bits)"
+      @static if USE_OPAQUE_PTR
+        push!(
+          instrs,
+          "%bitrawres = load $wide_typ, ptr %ptr.$(i-1), align 1" *
+          LOAD_SCOPE_TBAA_FLAGS
+        )
+      else
+        push!(
+          instrs,
+          "%bitrawres = load $wide_typ, $wide_typ* %ptr.$(i-1), align 1" *
+          LOAD_SCOPE_TBAA_FLAGS
+        )
+      end
+      # `%1` is the original (dynamic) index of type `i$(ibits)`; compute
+      # `index & 7` and zero-extend/truncate it to `wide_typ` for the shift.
+      push!(instrs, "%bitoff.raw = and i$(ibits) %1, 7")
+      if ibits < wide_bits
+        push!(
+          instrs,
+          "%bitoff = zext i$(ibits) %bitoff.raw to $wide_typ"
+        )
+      elseif ibits > wide_bits
+        push!(
+          instrs,
+          "%bitoff = trunc i$(ibits) %bitoff.raw to $wide_typ"
+        )
+      else
+        push!(instrs, "%bitoff = bitcast i$(ibits) %bitoff.raw to $wide_typ")
+      end
+      push!(instrs, "%bitshifted = lshr $wide_typ %bitrawres, %bitoff")
+      # Produce `<W x i1>` to keep downstream code identical.
+      if wide_bits > W
+        push!(instrs, "%bittrunc = trunc $wide_typ %bitshifted to i$(W)")
+        push!(instrs, "%res = bitcast i$(W) %bittrunc to <$W x i1>")
+      else
+        push!(instrs, "%res = bitcast $wide_typ %bitshifted to <$W x i1>")
+      end
+    else
+      @static if USE_OPAQUE_PTR
+        push!(
+          instrs,
+          "%res = load $vtyp, ptr %ptr.$(i-1), align $alignment" *
+          LOAD_SCOPE_TBAA_FLAGS
+        )
+      else
+        push!(
         instrs,
-        "%res = load $vtyp, ptr %ptr.$(i-1), align $alignment" *
+        "%res = load $vtyp, $vtyp* %ptr.$(i-1), align $alignment" *
         LOAD_SCOPE_TBAA_FLAGS
-      )
-    else
-      push!(
-      instrs,
-      "%res = load $vtyp, $vtyp* %ptr.$(i-1), align $alignment" *
-      LOAD_SCOPE_TBAA_FLAGS
-      )
+        )
+      end
     end
   end
   if isbit

From 33d18a1094b59b02e2a6834a9555b2d362223477 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas <accounts@chrisrackauckas.com>
Date: Tue, 26 May 2026 21:25:49 +0000
Subject: [PATCH 3/5] Bitcast pointer in BitVector dynamic-index load for
 non-opaque LLVM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous fix emitted `load $wide_typ, $wide_typ* %ptr.X` without
bitcasting the `%ptr.X` value, which `offset_ptr` produces typed as
`<W x i1>*`. Under Julia ≤ 1.10 (LLVM without opaque pointers) this
fails with `'%ptr.X' defined with type '<W x i1>*' but expected 'iN*'`,
seen on the downstream LoopVectorization.jl LTS interface tests.

Insert a `bitcast <W x i1>* to $wide_typ*` so the wide integer load
typechecks. No effect on the opaque-pointer path used by Julia 1.11+.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/llvm_intrin/memory_addr.jl | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/llvm_intrin/memory_addr.jl b/src/llvm_intrin/memory_addr.jl
index 257be2dd..6c6a58f7 100644
--- a/src/llvm_intrin/memory_addr.jl
+++ b/src/llvm_intrin/memory_addr.jl
@@ -990,9 +990,16 @@ function vload_quote_llvmcall_core(
           LOAD_SCOPE_TBAA_FLAGS
         )
       else
+        # `%ptr.$(i-1)` was typed as `<W x i1>*` (or similar) by `offset_ptr`;
+        # bitcast to `wide_typ*` before issuing the wide integer load so the
+        # non-opaque-pointer LLVM IR (Julia ≤ 1.10) typechecks.
         push!(
           instrs,
-          "%bitrawres = load $wide_typ, $wide_typ* %ptr.$(i-1), align 1" *
+          "%ptr.bit$(i-1) = bitcast $vtyp* %ptr.$(i-1) to $wide_typ*"
+        )
+        push!(
+          instrs,
+          "%bitrawres = load $wide_typ, $wide_typ* %ptr.bit$(i-1), align 1" *
           LOAD_SCOPE_TBAA_FLAGS
         )
       end

From 7977c5bcc275a539dc1c306c26836ab7d0fe95f6 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas <accounts@chrisrackauckas.com>
Date: Wed, 27 May 2026 14:55:42 +0000
Subject: [PATCH 4/5] Rerun CI on top of merged #128 + bumped downstream
 releases


From 283fe18a52573d21d1f525d485b9699b367a2a45 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas <accounts@chrisrackauckas.com>
Date: Fri, 29 May 2026 04:40:39 +0000
Subject: [PATCH 5/5] Rerun CI on top of SLEEFPirates v0.6.46+