From ff4d2972632cc5407e1b29f0fe7c63fae02dbf5e Mon Sep 17 00:00:00 2001 From: carlushuang Date: Mon, 22 Jun 2026 15:52:18 +0000 Subject: [PATCH] fix(opus): arch-guard fp8/bf8 packed-cvt builtins for RDNA3/3.5 (gfx1151) opus.hpp fp32_to_fp8 / fp8_to_fp32 (non-template) and aiter_opus_plus.h fp32_to_{fp8,bf8}_scaled_x2 use __builtin_amdgcn_cvt_pk_fp8_f32 / v_cvt_pk_{fp8,bf8}_f32, which require the fp8-conversion-insts target feature absent on RDNA3 (gfx1100) and RDNA3.5 (gfx115x). clang rejects the non-template builtins eagerly, so any HIP module that includes these headers (module_cache, module_activation, module_norm fused-quant, ...) fails to JIT on gfx1151 even for BF16 paths that never call fp8 conversion. Guard the instruction bodies behind the arches that have the feature (gfx942/gfx950/gfx1200/gfx1201/gfx1250) with a compile-only fallback on others. BF16 code paths never invoke fp8 cast at runtime on these arches. Enables ROCm/ATOM Qwen3.x (Qwen3.5/3.6) BF16 bring-up on gfx1151 (Strix Halo / Radeon 8060S). Companion: ROCm/ATOM PR (gfx1151 Qwen3.x bring-up). --- csrc/include/aiter_opus_plus.h | 10 ++++++++++ csrc/include/opus/opus.hpp | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/csrc/include/aiter_opus_plus.h b/csrc/include/aiter_opus_plus.h index 84134e29f0..adff09567c 100644 --- a/csrc/include/aiter_opus_plus.h +++ b/csrc/include/aiter_opus_plus.h @@ -51,6 +51,7 @@ OPUS_D decltype(auto) fp32_to_fp8_scaled_x2(const S& s, float inverted_scale) constexpr float hi = 448.0f, lo = -448.0f; #endif float a = tmp[0], b = tmp[1]; +#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx1250__) int w; asm volatile("v_med3_f32 %1, %1, %3, %4\n" "v_med3_f32 %2, %2, %3, %4\n" @@ -58,6 +59,11 @@ OPUS_D decltype(auto) fp32_to_fp8_scaled_x2(const S& s, float inverted_scale) : "=v"(w), "+v"(a), "+v"(b) : "v"(lo), "v"(hi)); return __builtin_bit_cast(fp8x2_t, static_cast(w)); +#else + // Arches without packed fp8-cvt (RDNA3/3.5, host): compile-only stub. 
+ // fp8 KV-cache is unused on these arches; never executed at runtime. + (void)a; (void)b; (void)lo; (void)hi; return fp8x2_t{}; +#endif } template , bool> = true> @@ -76,6 +82,7 @@ OPUS_D decltype(auto) fp32_to_bf8_scaled_x2(const S& s, float inverted_scale) fp32x2_t tmp = pk_mul_f32(s, fp32x2_t{inverted_scale, inverted_scale}); constexpr float hi = 57344.0f, lo = -57344.0f; float a = tmp[0], b = tmp[1]; +#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx1250__) int w; asm volatile("v_med3_f32 %1, %1, %3, %4\n" "v_med3_f32 %2, %2, %3, %4\n" @@ -83,6 +90,9 @@ OPUS_D decltype(auto) fp32_to_bf8_scaled_x2(const S& s, float inverted_scale) : "=v"(w), "+v"(a), "+v"(b) : "v"(lo), "v"(hi)); return __builtin_bit_cast(bf8x2_t, static_cast(w)); +#else + (void)a; (void)b; (void)lo; (void)hi; return bf8x2_t{}; +#endif } template , bool> = true> diff --git a/csrc/include/opus/opus.hpp b/csrc/include/opus/opus.hpp index cdbc5b5dc2..08eecad214 100644 --- a/csrc/include/opus/opus.hpp +++ b/csrc/include/opus/opus.hpp @@ -1125,12 +1125,22 @@ OPUS_D constexpr auto fp32_to_bf16(const fp32_t& x, number = {}) { // Template constexpr (packed variants, OPUS_CAST_DEFINE) survives because the check is deferred to instantiation. // TODO: we may remove constexpr from cast in the future OPUS_D auto fp32_to_fp8(const fp32_t& x) { +#if defined(__HIP_DEVICE_COMPILE__) && !(defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx1250__)) + // RDNA3/3.5 (gfx1100/gfx115x) lack fp8-conversion-insts; compile-only + // stub so headers build. BF16 code paths never invoke fp8 conversion. 
+ (void)x; return __builtin_bit_cast(fp8_t, static_cast(0)); +#else int w; w = __builtin_amdgcn_cvt_pk_fp8_f32(x, 0.0f, w, /*sel=lo*/0); return __builtin_bit_cast(fp8_t, static_cast(w)); +#endif } OPUS_D auto fp8_to_fp32(const fp8_t& x) { +#if defined(__HIP_DEVICE_COMPILE__) && !(defined(__gfx942__) || defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx1250__)) + (void)x; return fp32_t(0.0f); +#else int w = static_cast(__builtin_bit_cast(unsigned char, x)); return __builtin_amdgcn_cvt_f32_fp8(w, /*byte=*/0); +#endif } OPUS_D constexpr auto fp32_to_fp32(const fp32_t& x) { return x; } OPUS_D constexpr auto fp32_to_i8(const fp32_t& x) { return static_cast(x); }