From ff4d2972632cc5407e1b29f0fe7c63fae02dbf5e Mon Sep 17 00:00:00 2001
From: carlushuang <carlus.huang@amd.com>
Date: Mon, 22 Jun 2026 15:52:18 +0000
Subject: [PATCH] fix(opus): arch-guard fp8/bf8 packed-cvt builtins for
 RDNA3/3.5 (gfx1151)

opus.hpp fp32_to_fp8 / fp8_to_fp32 (non-template) and aiter_opus_plus.h
fp32_to_{fp8,bf8}_scaled_x2 use __builtin_amdgcn_cvt_pk_fp8_f32 /
v_cvt_pk_{fp8,bf8}_f32, which require the fp8-conversion-insts target
feature absent on RDNA3 (gfx1100) and RDNA3.5 (gfx115x). Because
fp32_to_fp8 / fp8_to_fp32 are not templates, clang checks their builtin calls
eagerly instead of deferring the check to instantiation, so any HIP module
that includes these headers (module_cache, module_activation, module_norm
fused-quant, ...) fails to JIT on gfx1151 even for BF16 paths that never call
fp8 conversion.

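Guard the instruction bodies behind the arches that have the feature, with a
compile-only fallback that returns zero everywhere else:
  - opus.hpp keeps the builtins on gfx942/gfx950/gfx1200/gfx1201/gfx1250
    (and in the host pass);
  - aiter_opus_plus.h keeps the inline-asm path on gfx942/gfx950 only, so
    every other target (including gfx12) takes the stub there.
BF16 code paths never invoke fp8 cast at runtime on the stubbed arches.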

Enables ROCm/ATOM Qwen3.x (Qwen3.5/3.6) BF16 bring-up on gfx1151 (Strix Halo /
Radeon 8060S). Companion: ROCm/ATOM PR (gfx1151 Qwen3.x bring-up).
---
 csrc/include/aiter_opus_plus.h | 10 ++++++++++
 csrc/include/opus/opus.hpp     | 10 ++++++++++
 2 files changed, 20 insertions(+)
diff --git a/csrc/include/aiter_opus_plus.h b/csrc/include/aiter_opus_plus.h
index 84134e29f0..adff09567c 100644
--- a/csrc/include/aiter_opus_plus.h
+++ b/csrc/include/aiter_opus_plus.h
@@ -51,6 +51,7 @@ OPUS_D decltype(auto) fp32_to_fp8_scaled_x2(const S& s, float inverted_scale)
     constexpr float hi = 448.0f, lo = -448.0f;
 #endif
     float a = tmp[0], b = tmp[1];
+#if defined(__gfx942__) || defined(__gfx950__)
     int w;
     asm volatile("v_med3_f32 %1, %1, %3, %4\n"
                  "v_med3_f32 %2, %2, %3, %4\n"
@@ -58,6 +59,11 @@ OPUS_D decltype(auto) fp32_to_fp8_scaled_x2(const S& s, float inverted_scale)
                  : "=v"(w), "+v"(a), "+v"(b)
                  : "v"(lo), "v"(hi));
     return __builtin_bit_cast(fp8x2_t, static_cast<int16_t>(w));
+#else
+    // Any target other than gfx942/gfx950 (RDNA3/3.5, host, ...): compile-only stub
+    // returning a zero fp8x2_t. Assumes the fp8 KV-cache path never runs on these targets.
+    (void)a; (void)b; (void)lo; (void)hi; return fp8x2_t{};
+#endif
 }
 
 template <typename S, std::enable_if_t<std::is_same_v<S, fp32x4_t>, bool> = true>
@@ -76,6 +82,7 @@ OPUS_D decltype(auto) fp32_to_bf8_scaled_x2(const S& s, float inverted_scale)
     fp32x2_t tmp       = pk_mul_f32(s, fp32x2_t{inverted_scale, inverted_scale});
     constexpr float hi = 57344.0f, lo = -57344.0f;
     float a = tmp[0], b = tmp[1];
+#if defined(__gfx942__) || defined(__gfx950__)
     int w;
     asm volatile("v_med3_f32 %1, %1, %3, %4\n"
                  "v_med3_f32 %2, %2, %3, %4\n"
@@ -83,6 +90,9 @@ OPUS_D decltype(auto) fp32_to_bf8_scaled_x2(const S& s, float inverted_scale)
                  : "=v"(w), "+v"(a), "+v"(b)
                  : "v"(lo), "v"(hi));
     return __builtin_bit_cast(bf8x2_t, static_cast<int16_t>(w));
+#else
+    (void)a; (void)b; (void)lo; (void)hi; return bf8x2_t{};
+#endif
 }
 
 template <typename S, std::enable_if_t<std::is_same_v<S, fp32x4_t>, bool> = true>
diff --git a/csrc/include/opus/opus.hpp b/csrc/include/opus/opus.hpp
index cdbc5b5dc2..08eecad214 100644
--- a/csrc/include/opus/opus.hpp
+++ b/csrc/include/opus/opus.hpp
@@ -1125,12 +1125,22 @@ OPUS_D constexpr auto fp32_to_bf16(const fp32_t& x, number<rm> = {}) {
 // Template constexpr (packed variants, OPUS_CAST_DEFINE) survives because the check is deferred to instantiation.
 // TODO: we may remove constexpr from cast in the future
 OPUS_D auto fp32_to_fp8(const fp32_t& x) {
+#if defined(__HIP_DEVICE_COMPILE__) && !(defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx1250__))
+    // Any device target not listed above (e.g. RDNA3/3.5, gfx1100/gfx115x, which lack
+    // fp8-conversion-insts): compile-only stub so headers build; the builtin is skipped.
+    (void)x; return __builtin_bit_cast(fp8_t, static_cast<signed char>(0)); // ignores x, always a zero byte; NOTE(review): silently wrong if an fp8 path ever runs here
+#else // non-device pass or a listed arch: use the real conversion builtin
     int w; w = __builtin_amdgcn_cvt_pk_fp8_f32(x, 0.0f, w, /*sel=lo*/0);
     return __builtin_bit_cast(fp8_t, static_cast<signed char>(w));
+#endif
 }
 OPUS_D auto fp8_to_fp32(const fp8_t& x) {
+#if defined(__HIP_DEVICE_COMPILE__) && !(defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx1250__))
+    (void)x; return fp32_t(0.0f); // compile-only stub for device targets not listed above: ignores x and yields 0.0f
+#else // non-device pass or a listed arch: use the real conversion builtin
     int w = static_cast<int>(__builtin_bit_cast(unsigned char, x));
     return __builtin_amdgcn_cvt_f32_fp8(w, /*byte=*/0);
+#endif
 }
 OPUS_D constexpr auto fp32_to_fp32(const fp32_t& x) { return x; }
 OPUS_D constexpr auto fp32_to_i8(const fp32_t& x) { return static_cast<i8_t>(x); }