diff --git a/example/neon.rs b/example/neon.rs
index 4c2a0d9871..af10c57c19 100644
--- a/example/neon.rs
+++ b/example/neon.rs
@@ -375,6 +375,46 @@ unsafe fn test_vmull_p8() {
     assert_eq!(r, e);
 }
 
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vqdmulh_s16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v4i16 (inputs here never saturate)
+    let a = i16x4::from([1, 2, 4, 8]);
+    let b = i16x4::from([16384, 16384, 16384, 16384]); // 2^14 per lane
+    let e = i16x4::from([0, 1, 2, 4]); // (2 * a * b) >> 16 == a >> 1
+    let r: i16x4 = unsafe { transmute(vqdmulh_s16(transmute(a), transmute(b))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vqdmulh_s32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v2i32 (inputs here never saturate)
+    let a = i32x2::from([1, 2]);
+    let b = i32x2::from([1073741824, 1073741824]); // 2^30 per lane
+    let e = i32x2::from([0, 1]); // (2 * a * b) >> 32 == a >> 1
+    let r: i32x2 = unsafe { transmute(vqdmulh_s32(transmute(a), transmute(b))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vqdmulhq_s16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v8i16 (inputs here never saturate)
+    let a = i16x8::from([1, 2, 4, 8, 16, 32, 64, 128]);
+    let b = i16x8::from([16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384]); // 2^14 per lane
+    let e = i16x8::from([0, 1, 2, 4, 8, 16, 32, 64]); // (2 * a * b) >> 16 == a >> 1
+    let r: i16x8 = unsafe { transmute(vqdmulhq_s16(transmute(a), transmute(b))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vqdmulhq_s32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.sqdmulh.v4i32 (inputs here never saturate)
+    let a = i32x4::from([1, 2, 4, 8]);
+    let b = i32x4::from([1073741824, 1073741824, 1073741824, 1073741824]); // 2^30 per lane
+    let e = i32x4::from([0, 1, 2, 4]); // (2 * a * b) >> 32 == a >> 1
+    let r: i32x4 = unsafe { transmute(vqdmulhq_s32(transmute(a), transmute(b))) };
+    assert_eq!(r, e);
+}
+
 #[cfg(target_arch = "aarch64")]
 fn main() {
     unsafe {
@@ -422,6 +462,11 @@ fn main() {
 
         test_vmull_p64();
         test_vmull_p8();
+
+        test_vqdmulh_s16();
+        test_vqdmulh_s32();
+        test_vqdmulhq_s16();
+        test_vqdmulhq_s32();
     }
 }
 
diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs
index 6f430542fc..d2403e079a 100644
--- a/src/intrinsics/llvm_aarch64.rs
+++ b/src/intrinsics/llvm_aarch64.rs
@@ -787,6 +787,42 @@ pub(super) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             );
         }
 
+        "llvm.aarch64.neon.sqdmulh.v2i32"
+        | "llvm.aarch64.neon.sqdmulh.v4i16"
+        | "llvm.aarch64.neon.sqdmulh.v4i32"
+        | "llvm.aarch64.neon.sqdmulh.v8i16" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/SQDMULH--vector---Signed-saturating-doubling-multiply-returning-high-half-
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            // Fold the "double and shift by esize" into "shift by esize - 1": (2 * p) >> esize == p >> (esize - 1), and the undoubled MIN * MIN still fits in product_ty.
+            // https://github.com/qemu/qemu/blob/81cc5f39aa3042e9c0b2ea772b42a2c8b1488e76/target/arm/tcg/mve_helper.c#L1267-L1283
+            let (result_ty, product_ty, shift, max) = match intrinsic {
+                "llvm.aarch64.neon.sqdmulh.v4i16" | "llvm.aarch64.neon.sqdmulh.v8i16" => {
+                    (types::I16, types::I32, 15, i64::from(i16::MAX))
+                }
+                "llvm.aarch64.neon.sqdmulh.v2i32" | "llvm.aarch64.neon.sqdmulh.v4i32" => {
+                    (types::I32, types::I64, 31, i64::from(i32::MAX))
+                }
+                _ => unreachable!(),
+            };
+
+            simd_pair_for_each_lane(
+                fx,
+                a,
+                b,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| {
+                    let a_lane = fx.bcx.ins().sextend(product_ty, a_lane); // widen to 2 * esize
+                    let b_lane = fx.bcx.ins().sextend(product_ty, b_lane);
+                    let product = fx.bcx.ins().imul(a_lane, b_lane);
+                    let product = fx.bcx.ins().sshr_imm(product, shift); // high half of 2 * a * b
+                    let max = fx.bcx.ins().iconst(product_ty, max);
+                    let result = fx.bcx.ins().smin(product, max); // only MIN * MIN exceeds max
+                    fx.bcx.ins().ireduce(result_ty, result)
+                },
+            );
+        }
+
         _ => {
             fx.tcx.dcx().warn(format!(
                 "unsupported AArch64 llvm intrinsic {}; replacing with trap",