diff --git a/example/neon.rs b/example/neon.rs index 4c2a0d9871..af10c57c19 100644 --- a/example/neon.rs +++ b/example/neon.rs @@ -375,6 +375,46 @@ unsafe fn test_vmull_p8() { assert_eq!(r, e); } +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqdmulh_s16() { + // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v4i16 + let a = i16x4::from([1, 2, 4, 8]); + let b = i16x4::from([16384, 16384, 16384, 16384]); + let e = i16x4::from([0, 1, 2, 4]); + let r: i16x4 = unsafe { transmute(vqdmulh_s16(transmute(a), transmute(b))) }; + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqdmulh_s32() { + // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v2i32 + let a = i32x2::from([1, 2]); + let b = i32x2::from([1073741824, 1073741824]); + let e = i32x2::from([0, 1]); + let r: i32x2 = unsafe { transmute(vqdmulh_s32(transmute(a), transmute(b))) }; + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqdmulhq_s16() { + // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v8i16 + let a = i16x8::from([1, 2, 4, 8, 16, 32, 64, 128]); + let b = i16x8::from([16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384]); + let e = i16x8::from([0, 1, 2, 4, 8, 16, 32, 64]); + let r: i16x8 = unsafe { transmute(vqdmulhq_s16(transmute(a), transmute(b))) }; + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqdmulhq_s32() { + // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v4i32 + let a = i32x4::from([1, 2, 4, 8]); + let b = i32x4::from([1073741824, 1073741824, 1073741824, 1073741824]); + let e = i32x4::from([0, 1, 2, 4]); + let r: i32x4 = unsafe { transmute(vqdmulhq_s32(transmute(a), transmute(b))) }; + assert_eq!(r, e); +} + #[cfg(target_arch = "aarch64")] fn main() { unsafe { @@ -422,6 +462,11 @@ fn main() { test_vmull_p64(); test_vmull_p8(); + + test_vqdmulh_s16(); + test_vqdmulh_s32(); + test_vqdmulhq_s16(); + test_vqdmulhq_s32(); } } diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs index 6f430542fc..d2403e079a 100644 --- a/src/intrinsics/llvm_aarch64.rs +++ b/src/intrinsics/llvm_aarch64.rs @@ -787,6 +787,42 @@ pub(super) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( ); } + "llvm.aarch64.neon.sqdmulh.v2i32" + | "llvm.aarch64.neon.sqdmulh.v4i16" + | "llvm.aarch64.neon.sqdmulh.v4i32" + | "llvm.aarch64.neon.sqdmulh.v8i16" => { + // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/SQDMULH--vector---Signed-saturating-doubling-multiply-returning-high-half- + intrinsic_args!(fx, args => (a, b); intrinsic); + + // Simplify the "double and shift by esize" into "shift by esize - 1". + // https://github.com/qemu/qemu/blob/81cc5f39aa3042e9c0b2ea772b42a2c8b1488e76/target/arm/tcg/mve_helper.c#L1267-L1283 + let (result_ty, product_ty, shift, max) = match intrinsic { + "llvm.aarch64.neon.sqdmulh.v4i16" | "llvm.aarch64.neon.sqdmulh.v8i16" => { + (types::I16, types::I32, 15, i64::from(i16::MAX)) + } + "llvm.aarch64.neon.sqdmulh.v2i32" | "llvm.aarch64.neon.sqdmulh.v4i32" => { + (types::I32, types::I64, 31, i64::from(i32::MAX)) + } + _ => unreachable!(), + }; + + simd_pair_for_each_lane( + fx, + a, + b, + ret, + &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| { + let a_lane = fx.bcx.ins().sextend(product_ty, a_lane); + let b_lane = fx.bcx.ins().sextend(product_ty, b_lane); + let product = fx.bcx.ins().imul(a_lane, b_lane); + let product = fx.bcx.ins().sshr_imm(product, shift); + let max = fx.bcx.ins().iconst(product_ty, max); + let result = fx.bcx.ins().smin(product, max); + fx.bcx.ins().ireduce(result_ty, result) + }, + ); + } + _ => { fx.tcx.dcx().warn(format!( "unsupported AArch64 llvm intrinsic {}; replacing with trap",