From c042d3f482a328b002adac16ccf430313e0bf084 Mon Sep 17 00:00:00 2001 From: tlwu Date: Tue, 9 Jun 2026 20:29:00 +0000 Subject: [PATCH 1/6] refactoring --- docs/ContribOperators.md | 2 +- .../cuda/llm/fpA_intB_gemm_preprocessors.h | 2 ++ .../llm/fpA_intB_gemm_preprocessors_impl.cu | 13 +++++++++++++ .../llm/fpA_intB_gemm_preprocessors_impl.h | 6 +++--- .../contrib_ops/cuda/moe/moe_quantization.cc | 19 ++----------------- .../core/graph/contrib_ops/contrib_defs.cc | 2 +- .../python/onnxruntime_pybind_quant.cc | 12 +----------- .../python/transformers/test_qmoe_cuda.py | 2 +- 8 files changed, 24 insertions(+), 34 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 9d19a95136ad7..2a55a90f20425 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -4949,7 +4949,7 @@ This version of the operator has been available since version 1 of the 'com.micr
use_sparse_mixer : int
Whether to use sparse mixer
weights_prepacked : int
-
Only meaningful when quant_type='int'. Tri-state control over whether the int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS fpA_intB format expected by the runner. -1 (auto): let the execution provider choose its own backward-compatible default; the CUDA EP treats auto as prepacked. 1: the initializers are already prepacked (e.g. produced offline by pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers are raw, un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself in PrePack(), matching the behaviour of MatMulNBits and removing the offline pre-pack requirement from exporters. Defaults to -1 (auto) so each execution provider can pick its own backward-compatible default rather than the schema imposing one.
+
Only meaningful when quant_type='int'. Tri-state control over whether the int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS fpA_intB format expected by the runner. -1 (auto): let the execution provider choose its own backward-compatible default; the CUDA EP treats auto as prepacked. 1: the initializers are already prepacked (e.g. produced offline by pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers are raw, un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself in PrePack(), matching the behavior of MatMulNBits and removing the offline pre-pack requirement from exporters. Defaults to -1 (auto) so each execution provider can pick its own backward-compatible default rather than the schema imposing one.
#### Inputs (6 - 21) diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h index b9e62443145e5..c3b734816cf84 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors.h @@ -31,6 +31,8 @@ enum class QuantType { W4_AFP8 }; +int get_arch_for_mixed_gemm_weight_preprocess(int arch); + void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream, int arch, int8_t* preprocessed_quantized_weight, diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu index a006612ddadc9..7e83bdda72eab 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu @@ -521,6 +521,19 @@ void add_bias_and_interleave_quantized_tensor_inplace_cuda( } } +int get_arch_for_mixed_gemm_weight_preprocess(int arch) { + ORT_ENFORCE(arch >= 75, "Unsupported CUDA architecture: ", arch); + if (arch < 80) { + return 75; + } +#ifndef EXCLUDE_SM_90 + if (arch >= 90 && arch < 100) { + return 90; + } +#endif + return 80; +} + void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream, int arch, int8_t* preprocessed_quantized_weight, diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.h b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.h index a8fb411ed0663..47bbe0c0e10ec 100644 --- a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.h +++ b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.h @@ -120,11 +120,11 @@ LayoutDetails getLayoutDetailsForArch(QuantType quant_type) { } LayoutDetails getLayoutDetailsForTransform(QuantType quant_type, int arch) { - ORT_ENFORCE(arch >= 75, "Unsupported CUDA architecture: ", arch); - if (arch < 80) { + arch = 
get_arch_for_mixed_gemm_weight_preprocess(arch); + if (arch == 75) { return getLayoutDetailsForArch(quant_type); #ifndef EXCLUDE_SM_90 - } else if (arch >= 90 && arch < 100) { + } else if (arch == 90) { return getLayoutDetailsForArch(quant_type); #endif } else { diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc index e1ddcac0cea4f..c0ac6439bd86c 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc @@ -850,7 +850,7 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const { // PrePack converted the raw int4/int8 weights to the CUTLASS fpA_intB // layout that the runner consumes and freed the source initializer // (``is_packed = true``). Gate on ``int_weights_consumed_by_prepack`` - // (which already requires ``packed_fc1_weights_ != nullptr``) rather than + // (which already requires both packed weight buffers) rather than // just ``is_int && !weights_prepacked_``: when prepacking is disabled at // the session level (``session.disable_prepacking``) PrePack never runs, // the prepack buffers stay null, and the raw initializer pointers read @@ -1158,22 +1158,7 @@ void QMoE::PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, Al const int64_t k_packed = shape[2]; const int64_t k = k_packed * pack_factor; - // Weight packing is architecture-aware (see - // docs/contrib_ops/cuda/moe_qmoe.md §7 "Cross-Architecture Packing - // Compatibility"). SM90 (Hopper) uses its own Permuted-Linear layout that - // skips column interleaving, so it is its own compatibility group. Every - // other supported arch — SM75/80/86/89 and SM100/120 (Blackwell) — shares - // the SM80 fpA_intB layout, so they all pack as SM80. SM70 and older lack - // INT8 LDSM and are unsupported. The compute-side runner selects the same - // layout from this clamped arch, so the two cannot drift. 
- // - // SM75 is passed through unchanged (rather than clamped to 80) even though it - // shares SM80's layout: the compute-side dispatch (getLayoutDetailsForTransform) - // still has a distinct SM75 branch, so mirroring it here avoids confusing a - // reader into thinking prepack and dispatch disagree. - ORT_ENFORCE(sm_ >= 75, - "QMoE int4/int8 weight prepack requires SM75 or newer, got sm=", sm_); - const int packing_sm = (sm_ == 90 || sm_ == 75) ? sm_ : 80; + const int packing_sm = onnxruntime::llm::kernels::weight_only::get_arch_for_mixed_gemm_weight_preprocess(sm_); // Per-expert sizes. const size_t per_expert_bytes = static_cast(n) * static_cast(k) / pack_factor; diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 1054fd94ef423..6eec50a400436 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1528,7 +1528,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers " "are raw, un-prepacked [E, N, K/pack] tensors as produced by " "quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself " - "in PrePack(), matching the behaviour of MatMulNBits and removing the offline " + "in PrePack(), matching the behavior of MatMulNBits and removing the offline " "pre-pack requirement from exporters. 
Defaults to -1 (auto) so each execution " "provider can pick its own backward-compatible default rather than the schema " "imposing one.", diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 5b1d590a06234..7220153b4fa17 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -16,7 +16,6 @@ #endif #include #include -#include #include namespace pybind11 { @@ -252,17 +251,8 @@ py::array_t PackWeightsForMixedGemm( cudaDeviceProp device_prop; ThrowIfCudaError(cudaGetDeviceProperties(&device_prop, device_id), "cudaGetDeviceProperties"); sm = device_prop.major * 10 + device_prop.minor; - } else { - // Validate force_arch against the SM versions for which preprocess_weights_for_mixed_gemm_cuda - // has tile/permutation tables. Unknown SMs would silently produce incorrect weight layouts. - static const std::set kSupportedSm = {75, 80, 90}; - if (kSupportedSm.find(sm) == kSupportedSm.end()) { - std::ostringstream oss; - oss << "force_arch=" << sm << " is not a supported SM version. " - << "Pass -1 for auto-detect, or one of: 75, 80, 90 (arch > 90 will fallback to 80)."; - throw std::invalid_argument(oss.str()); - } } + sm = ::onnxruntime::llm::kernels::weight_only::get_arch_for_mixed_gemm_weight_preprocess(sm); auto permutation_map_buffer = make_cuda_ptr(32 * sizeof(int32_t)); diff --git a/onnxruntime/test/python/transformers/test_qmoe_cuda.py b/onnxruntime/test/python/transformers/test_qmoe_cuda.py index 993716a4c80b0..10a856c07366f 100644 --- a/onnxruntime/test/python/transformers/test_qmoe_cuda.py +++ b/onnxruntime/test/python/transformers/test_qmoe_cuda.py @@ -2110,7 +2110,7 @@ class TestQMoEIntPrePackSmoke(unittest.TestCase): hardware (the other ``test_swiglu_qmoe_parity_*`` cases in this file fail on H200 / H100 with max-diff > 1.0 on plain main, by inspection — pre-existing). 
A real parity check can be added once - that harness honours the runtime SM. + that harness honors the runtime SM. """ def _run_one(self, *, hidden_size, inter_size, num_experts, top_k, swiglu_fusion, batch_size): From bbfa9fb7ca32445871f7508566374b4c69a84e72 Mon Sep 17 00:00:00 2001 From: tlwu Date: Tue, 9 Jun 2026 21:01:34 +0000 Subject: [PATCH 2/6] update doc --- docs/contrib_ops/cuda/moe_qmoe.md | 52 +++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/docs/contrib_ops/cuda/moe_qmoe.md b/docs/contrib_ops/cuda/moe_qmoe.md index 6d53211ff40cb..f30cb3354f024 100644 --- a/docs/contrib_ops/cuda/moe_qmoe.md +++ b/docs/contrib_ops/cuda/moe_qmoe.md @@ -71,6 +71,7 @@ input tokens → router (top-k softmax) → permute by expert | `expert_weight_bits` (QMoE only) | int | 4 | 4 (INT4/MXFP4) or 8 (INT8/FP8). | | `block_size` (QMoE only) | int | -1 | Group size for INT4/INT8 group-wise quantization. -1 = per-output-channel. | | `quant_type` (QMoE only) | string | `"int"` | `"int"`, `"fp4"`, `"fp8"`, `"wfp4afp8"`. See [§3](#3-quantization-modes). | +| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. `-1` (auto): each EP picks its own backward-compatible default — the CUDA EP treats auto as prepacked. `1`: the INT4/INT8 `fc1`/`fc2` initializers are already in the CUTLASS `fpA_intB` layout (e.g. from `pack_weights_for_cuda_mixed_gemm`) and are consumed as-is. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()`. See [§5.1](#51-weights-input-2--5--8). | ### 2.2 Type Constraints @@ -228,10 +229,39 @@ extra subtraction. ### 5.1 Weights (input 2 / 5 / 8) -Not transformed at runtime. INT4/INT8 weights must already be packed offline by -`pack_weights_for_cuda_mixed_gemm` (see [§6](#6-weight-formats)). MXFP4 weights -must be packed by `pack_fp4_weights_for_cuda_moe_gemm`. 
FP8 weights are stored -as raw e4m3 bytes (no packing). +**INT4/INT8** weight layout is controlled by the `weights_prepacked` attribute +([§2.1](#21-attributes)): + +- **`weights_prepacked=-1` (auto, default) or `1`** — the `fc1`/`fc2` weights are + assumed to already be in the CUTLASS `fpA_intB` layout, packed offline by + `pack_weights_for_cuda_mixed_gemm` (see [§6](#6-weight-formats)). They are + copied to GPU and consumed as-is (today's behavior). +- **`weights_prepacked=0`** — the `fc1`/`fc2` weights are raw, schema-conformant + `[E, N, K/pack]` tensors as produced by `quantize_matmul_{4,8}bits`. `PrePack` + runs the CUTLASS layout transform itself via `PrePackIntExpertWeights`, + removing the offline pre-pack dependency. This makes integer QMoE symmetric + with `MatMulNBits::PrePack_B`. + +`PrePackIntExpertWeights` loops over the `E` experts and, per expert, applies the +same transpose + row-permutation / column-interleave / bias / pair-interleave +transform as `pack_weights_for_cuda_mixed_gemm` (see [§6.1](#61-int4-group-wise-quant_typeint-expert_weight_bits4)). +Packing is architecture-aware ([§7](#7-cross-architecture-packing-compatibility)): +SM90 (Hopper) is its own layout group, while all other supported arches +(SM75/80/86/89, SM100/120) share the SM80 layout. SM75+ is required. The source +`[E, N, K/pack]` initializers are released after their shapes are cached +(`fc1_weights_shape_` / `fc2_weights_shape_`), so peak weight memory stays ~1×. +The prepacked GPU buffers (`packed_fc1_weights_` / `packed_fc2_weights_`) are then +preferred by `ComputeInternal`. If prepacking is disabled at the session level +(`session.disable_prepacking`), the buffers stay null and the raw initializer +pointers are read at compute time instead. + +> **Note**: `weights_prepacked=0` is the only path that triggers an in-`PrePack` +> layout transform for INT weights. FP4 / FP8 / WFP4AFP8 weight handling is +> unaffected.
+ +MXFP4 weights must be packed by `pack_fp4_weights_for_cuda_moe_gemm`. FP8 weights +are stored as raw e4m3 bytes (no packing). + ### 5.2 INT4/INT8 scales + zero-point → bias @@ -287,7 +317,12 @@ This section covers the five distinct weight encodings supported by QMoE. INT4 packing layout within a byte: `[high_nibble | low_nibble] = [elt_1 | elt_0]`. Each INT4 element is in `[-8, 7]` (signed) before bias, `[0, 15]` after the +8 bias. -#### Preprocessing pipeline (offline, `pack_weights_for_cuda_mixed_gemm`) +#### Preprocessing pipeline (offline `pack_weights_for_cuda_mixed_gemm`, or in-`PrePack` via `PrePackIntExpertWeights`) + +This is the layout transform applied either offline by +`pack_weights_for_cuda_mixed_gemm`, or per-expert inside `PrePack` when +`weights_prepacked=0` (see [§5.1](#51-weights-input-2--5--8)). + 1. **Input layout**: `[N, K]` per expert (Out × In), 2 elements per byte for INT4. 2. **Transpose & signed conversion**: @@ -830,7 +865,7 @@ will not change the operator interface. |-----------|----------| | [test_moe_cuda.py](onnxruntime/test/python/transformers/test_moe_cuda.py) | Standard MoE on CUDA: FP16/BF16, SiLU/GeLU/SwiGLU, routing, GEMM parity. SwiGLU coverage includes both GPT-OSS (`TestSwigluMoE`: interleaved, alpha=1.702/beta=1.0/limit=7.0) and Standard/Llama-Gemma (`TestStandardSwigluMoE`: concatenated `swiglu_fusion=2`, alpha=1.0/beta=0.0/no limit → `SiLU(Gate)×Value`). | | [test_moe_cpu.py](onnxruntime/test/python/transformers/test_moe_cpu.py) | Standard MoE on CPU (smoke). | -| [test_qmoe_cuda.py](onnxruntime/test/python/transformers/test_qmoe_cuda.py) | INT4/INT8 QMoE — primary regression signal for the production QMoE path. Exercises `pack_weights_for_cuda_mixed_gemm` and dequant-then-matmul reference. | +| [test_qmoe_cuda.py](onnxruntime/test/python/transformers/test_qmoe_cuda.py) | INT4/INT8 QMoE — primary regression signal for the production QMoE path. 
Exercises `pack_weights_for_cuda_mixed_gemm` and dequant-then-matmul reference. `TestQMoEIntPrePackSmoke` covers the raw-weight `weights_prepacked=0` in-`PrePack` layout transform (smoke test: asserts finite output, not bit-parity). | | [test_qmoe_cpu.py](onnxruntime/test/python/transformers/test_qmoe_cpu.py) | INT4/INT8 QMoE on CPU (smoke). | | [test_qmoe_fp4_cuda.py](onnxruntime/test/python/transformers/test_qmoe_fp4_cuda.py) | MXFP4 QMoE: quantization utilities, packing, FP16/BF16, SiLU/SwiGLU, top-k and expert-count variants. End-to-end runs on SM120; on SM<120 the dequant fallback is exercised. | | [test_qmoe_fp8_cuda.py](onnxruntime/test/python/transformers/test_qmoe_fp8_cuda.py) | FP8 W8A16 QMoE on SM90+ native path and SM<90 dequant fallback. | @@ -954,6 +989,11 @@ over-aligned by-value parameters. cannot. See [§14.1](#141-msvc-and-tma-grouped-moe-gemm). - **WFP4AFP8 native** requires SM100+ hardware; only the dequant fallback path is validated end-to-end so far. +- **In-`PrePack` INT weight layout transform** (`weights_prepacked=0`) is + currently covered only by a smoke test (`TestQMoEIntPrePackSmoke`), not a + bit-parity check: the existing offline pre-pack harness hardcodes + `force_arch=80` and produces incorrect output on SM≥90, so a parity + comparison against it is omitted until that harness honors the runtime SM. - **Hopper W4A8** (INT4 weight + FP8 activation) is not supported — TRT-LLM gates its fast path to SM89 only.
From a2ace900486dad6d7fe159ff343456024c063b17 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 9 Jun 2026 17:53:34 -0700 Subject: [PATCH 3/6] force sm80 for prepacking moe weights --- docs/ContribOperators.md | 354 +++++++++--------- docs/contrib_ops/cuda/moe_qmoe.md | 47 ++- .../contrib_ops/cuda/moe/moe_quantization.cc | 36 +- .../contrib_ops/cuda/moe/moe_quantization.h | 27 +- .../core/graph/contrib_ops/contrib_defs.cc | 16 +- .../test/python/transformers/test_moe_cuda.py | 12 +- .../python/transformers/test_qmoe_cuda.py | 6 +- 7 files changed, 273 insertions(+), 225 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 2a55a90f20425..478e645664203 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -125,24 +125,24 @@ Do not modify directly.* ### **com.microsoft.Attention** Multi-Head Attention that can be either unidirectional (like GPT-2) or bidirectional (like BERT). - + The weights for input projection of Q, K and V are merged. The data is stacked on the second dimension. Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). Here hidden_size is the hidden dimension of Q and K, and v_hidden_size is that of V. - + The mask_index is optional. Besides raw attention mask with shape (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) with value 0 for masked and 1 otherwise, we support other two formats: When input has right-side padding, mask_index is one dimension with shape (batch_size), where value is actual sequence length excluding padding. When input has left-side padding, mask_index has shape (2 * batch_size), where the values are the exclusive end positions followed by the inclusive start positions. - + When unidirectional is 1, each token only attends to previous tokens. - + Both past and present state are optional. They shall be used together, and not allowed to use only one of them. 
The qkv_hidden_sizes is required only when K and V have different hidden sizes. - + When there is past state, hidden dimension for Q, K and V shall be the same. - + The total_sequence_length is past_sequence_length + kv_sequence_length. Here kv_sequence_length is the length of K or V. For self attention, kv_sequence_length equals to sequence_length (sequence length of Q). For cross attention, query and key might have different lengths. @@ -214,133 +214,133 @@ This version of the operator has been available since version 1 of the 'com.micr Computes an one-layer RNN where its RNN Cell is an AttentionWrapper wrapped a LSTM Cell. The RNN layer contains following basic component: LSTM Cell, Bahdanau Attention Mechanism, AttentionWrapp. - + Activation functions: - + Relu(x) - max(0, x) - + Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) - + Sigmoid(x) - 1/(1 + e^{-x}) - + (NOTE: Below are optional) - + Affine(x) - alpha*x + beta - + LeakyRelu(x) - x if x >= 0 else alpha * x - + ThresholdedRelu(x) - x if x >= alpha else 0 - + ScaledTanh(x) - alpha*Tanh(beta*x) - + HardSigmoid(x) - min(max(alpha*x + beta, 0), 1) - + Elu(x) - x if x >= 0 else alpha*(e^x - 1) - + Softsign(x) - x/(1 + |x|) - + Softplus(x) - log(1 + e^x) - + Softmax(x) - exp(x) / sum(exp(x)) - + Bahdanau Attention Mechanism: `M` - Memory tensor. - + `VALUES` - masked Memory by its real sequence length. - + `MW` - Memory layer weight. - + `KEYS` - Processed memory tensor by the memory layer. KEYS = M * MW - + `Query` - Query tensor, normally at specific time step in sequence. 
- + `QW` - Query layer weight in the attention mechanism - + `PQ` - processed query, = `Query` * `QW` - + `V' - attention vector - + `ALIGN` - calculated alignment based on Query and KEYS ALIGN = softmax(reduce_sum(`V` * Tanh(`KEYS` + `PQ`))) - + `CONTEXT` - context based on `ALIGN` and `VALUES` CONTEXT = `ALIGN` * `VALUES` - - + + LSTM Cell: `X` - input tensor concat with attention state in the attention wrapper - + `i` - input gate - + `o` - output gate - + `f` - forget gate - + `c` - cell gate - + `t` - time step (t-1 means previous time step) - + `W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates - + `R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates - + `Wb[iofc]` - W bias vectors for input, output, forget, and cell gates - + `Rb[iofc]` - R bias vectors for input, output, forget, and cell gates - + `P[iof]` - P peephole weight vector for input, output, and forget gates - + `WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates - + `RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates - + `WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates - + `RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates - + `PB[iof]` - P peephole weight vector for backward input, output, and forget gates - + `H` - Hidden state - + `num_directions` - 2 if direction == bidirectional else 1 - + Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): - + - it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) - + - ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) - + - ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) - + - Ct = ft (.) Ct-1 + it (.) ct - + - ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) - + - Ht = ot (.) h(Ct) - - + + AttentionWrapp Notations: `lstm()' - wrapped inner cell. Ht, Ct = lstm(concat(Xt, ATTNt-1), Ct-1) - + `am()` - attention mechanism the wrapper used. 
CONTEXTt, ALIGNt = am(Ht, ALIGNt-1) - + `AW` - attention layer weights, optional. - + `ATTN` - attention state, initial is zero. If `AW` provided, it is the output of the attention layer, ATTNt = concat(Ht, CONTEXTt) * AW otherwise, ATTNt = CONTEXTt - + RNN layer output: `Y` - if needed is the sequence of Ht from lstm cell. - + `Y_h` - is the last valid H from lstm cell. - + `Y_c` - is the last valid C from lstm cell. - + #### Version @@ -594,7 +594,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.BiasGelu** Bias Gelu. - It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. + It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. #### Version @@ -819,7 +819,7 @@ This version of the operator has been available since version 1 of the 'com.micr ``` scale = 1. / (1. - ratio). ``` - + This op functions in much the same was as Dropout-11 and Dropout-13 do, except that the mask is output as a bit-packed uint32 tensor, instead of a boolean tensor. #### Version @@ -907,20 +907,20 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.CausalConvWithState** Stateful causal depthwise convolution, generalized to N spatial dimensions. - + Used by Gated DeltaNet (Qwen3.5) and Mamba (Jamba, FalconMamba) as a preprocessing step. Replaces the 3-op pattern (Concat + Conv + Slice) with a single fused operation. - + The convolution is causal (looks only at current and past positions along the last spatial dimension) and depthwise (each channel is convolved independently with its own kernel). - + Input layout is channels-first: (batch_size, channels, ...). Weight layout: (channels, 1, k_1, ...) for depthwise convolution. The carry state stores the last (k-1) positions along the causal axis for incremental decode. 
- + The ndim attribute generalizes the op to 1D, 2D, or 3D spatial dimensions. Causality is enforced on the last spatial dimension only. - + The optional activation attribute supports fused SiLU/Swish activation. #### Version @@ -1277,17 +1277,17 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.DecoderMaskedSelfAttention** Self attention that supports input sequence length of 1. - + The weights for input projection of Q, K and V are merged. The data is stacked on the second dimension. Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). Here hidden_size is the hidden dimension of Q and K, and v_hidden_size is that of V. - + The mask_index is optional. If it is provided, only raw attention mask with shape (batch_size, total_sequence_length) is supported currently. - + Both past and present state need to be provided. - + The qkv_hidden_sizes is required only when K and V have different hidden sizes. - + The total_sequence_length is past_sequence_length + kv_sequence_length. Here kv_sequence_length is the length of K or V. Currently, only self attention is supported which means that kv_sequence_length equals to sequence_length (sequence length of Q). @@ -2349,12 +2349,12 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.GemmaRotaryEmbedding** GemmaRotaryEmbedding is the implementation of below part of rotary positional embeddings (RoPE). It implements below from modeling_gemma.py. 
- + Here's onnxscript that was tested - + from onnxscript import FLOAT, FLOAT16, script from onnxscript import opset18 as op - + @script() def gemma_rotary_embedding(emb: FLOAT["bs", "seq_len", "dim"], q: FLOAT16["bs", "num_heads", "seq_len", "dim"], q_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"], k: FLOAT16["bs", "num_heads", "seq_len", "dim"], k_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"]): sin_val = op.Sin(emb) @@ -2366,10 +2366,10 @@ This version of the operator has been available since version 1 of the 'com.micr q_embed = (q * casted_cos) + (q_rot * casted_sin) k_embed = (k * casted_cos) + (k_rot * casted_sin) return q_embed, k_embed - + onnx_model = gemma_rotary_embedding.to_model_proto() - - + + #### Version @@ -2485,7 +2485,7 @@ This version of the operator has been available since version 1 of the 'com.micr which are used to interpolate the output value `output[n, :, h, w]`. The GridSample operator is often used in doing grid generator and sampler in the [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/master/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample). - + #### Version @@ -2531,13 +2531,13 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.GroupNorm** Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization (https://arxiv.org/abs/1803.08494). - + This operator transforms input according to y = gamma * (x - mean) / sqrt(variance + epsilon) + beta - + The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. num_channels must be divisible by num_groups. The mean and standard-deviation are calculated separately over the each group. The weight and bias are per-channel affine transform parameter vectors of size num_channels. 
- + The activation attribute can be used to enable activation after group normalization. #### Version @@ -2588,21 +2588,21 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.GroupQueryAttention** Group Query Self/Cross Attention with KV Cache Quantization Support. - + This operator implements causal grouped-query attention with past state (KV cache) support. It also supports optional float8, int8 or int4 quantization for the KV cache to reduce memory footprint. - + **Cache Format:** The past and present KV cache tensors are expected in a BNSH format: `(batch_size, num_heads, cache_sequence_length, head_size)`, where `cache_sequence_length` is the length of the cached key/value sequences, or the maximum sequence length when past and present buffer sharing is used. - + **Quantization:** When quantization is enabled, `past_key` and `past_value` inputs can be of type `float8e4m3fn`, `uint8` or `int8`. The corresponding `k_scale` and `v_scale` tensors must be provided. The operator will output `present_key` and `present_value` in same format as the `past_key` and `past_value`. - + For 4-bit quantization, the data type is uint8 where each byte contains two 4-bit values. The bit width of quantized KV cache can be set using `kv_cache_bit_width` attribute. - + The shapes of the k_scale, v_scale tensors shall be broadcastable to present_key shape. - + **Quantization Modes (`k_quant_type`, `v_quant_type` attributes):** - **"NONE"**: No quantization. - **"PER_TENSOR"**: A single scale for the entire tensor. Scale example shape: `[1]`. @@ -2779,18 +2779,18 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.LinearAttention** Unified linear attention operator for autoregressive decoding (T=1) and prefill (T>1). - + All inputs use 3D packed format [B, T, H*D]; q_num_heads and kv_num_heads are always required. The op internally unpacks to 4D for computation. 
- + The update_rule attribute selects the recurrence type: - "linear": S_t = S_{t-1} + k_t ⊗ v_t; o_t = scale * q_t^T S_t - "gated": S_t = exp(g_t) * S_{t-1} + k_t ⊗ v_t; o_t = scale * q_t^T S_t - "delta": S_t = S_{t-1} + β_t * k_t ⊗ (v_t - S_{t-1}^T k_t); o_t = scale * q_t^T S_t - "gated_delta": S_t = exp(g_t) * S_{t-1} + β_t * k_t ⊗ (v_t - exp(g_t) * S_{t-1}^T k_t); o_t = scale * q_t^T S_t - + where g_t is the decay (in log-space), β_t is the update rate, and ⊗ denotes outer product. - + Semantics: Equivalent to running the recurrent update sequentially for each token, but may be implemented using chunk-parallel algorithms for GPU efficiency. @@ -2854,10 +2854,10 @@ This version of the operator has been available since version 1 of the 'com.micr Longformer Self Attention with a local context and a global context. Tokens attend locally: Each token attends to its W previous tokens and W succeeding tokens with W being the window length. A selected few tokens attend globally to all other tokens. - + The attention mask is of shape (batch_size, sequence_length), where sequence_length is a multiple of 2W after padding. Mask value < 0 (like -10000.0) means the token is masked, 0 otherwise. - + Global attention flags have value 1 for the tokens attend globally and 0 otherwise. #### Version @@ -2916,32 +2916,32 @@ This version of the operator has been available since version 1 of the 'com.micr 2. Input B is quantized with 4 bits with quantization data type specified by attribute 'quant_type'. It is transposed, flattened and quantized blockwisely with block size specified by attribute 'block_size'. And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. 3. Input B's quantization constants or scales are specified by input 'absmax'. - + Input B is stored as uint8_t with shape: [(N * K + 1) / 2]. 
Input absmax is stored in same type as original type of B(float32, float16) with shape like: [(N * K + block_size - 1) / block_size]. - - + + 1. (Default value) transB=True (Majorly used for forward pass) Shape of A: [D0, D1, ..., Dn, K] Shape of Dequanted B: [N, K], this is aligned with how PyTorch defined the linear weight, .e.g [out_features, in_features]. - + The computation math: dequant_B = dequant(B, absmax, quant_type, block_size) transposed_dequant_B = dequant_B^T output = A @ transposed_dequant_B - + Shape of output: [D0, D1, ..., Dn, N] - + 2. transB=False (Majorly used for backward pass) Shape of A: [D0, D1, ..., Dn, N] Shape of Dequanted B: [N, K], this is aligned with how PyTorch defined the linear weight, .e.g [out_features, in_features]. - + The computation math: dequant_B = dequant(B, absmax, quant_type, block_size) output = A @ dequant_B - + Shape of output: [D0, D1, ..., Dn, K] - + #### Version @@ -3127,17 +3127,17 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MatMulNBits** MatMulNBits performs a matrix multiplication where the right-hand-side matrix (weights) is quantized to N bits. - + It is a fusion of two operations: 1. Linear dequantization of the quantized weights using scale and (optionally) zero-point with formula: dequantized_weight = (quantized_weight - zero_point) * scale 2. Matrix multiplication between the input matrix A and the dequantized weight matrix. - + The weight matrix is a 2D constant matrix with the input feature count and output feature count specified by attributes 'K' and 'N'. It is quantized block-wise along the K dimension with a block size specified by the 'block_size' attribute. The block size must be a power of 2 and not smaller than 16 (e.g., 16, 32, 64, 128). Each block has its own scale and zero-point. The quantization is performed using a bit-width specified by the 'bits' attribute, which can take values from 2 to 8. 
- + The quantized weights are stored in a bit-packed format along the K dimension, with each block being represented by a blob of uint8. For example, for 4 bits, the first 4 bits are stored in the lower 4 bits of a byte, and the second 4 bits are stored in the higher 4 bits of a byte. @@ -3201,32 +3201,32 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MatMulNBitsMlp** MatMulNBitsMlp fuses two MatMulNBits projections that share the same input and computes - + gate = MatMulNBits(A, gate_weight) + gate_bias up = MatMulNBits(A, up_weight) + up_bias Y = activation(gate) * up - + It can also optionally fuse SimplifiedLayerNormalization or SkipSimplifiedLayerNormalization before the two projections: - + A_norm = SimplifiedLayerNormalization(A, norm_scale, epsilon) gate = MatMulNBits(A_norm, gate_weight) + gate_bias up = MatMulNBits(A_norm, up_weight) + up_bias Y = activation(gate) * up - + A_norm = SkipSimplifiedLayerNormalization(A, skip, norm_scale, epsilon) gate = MatMulNBits(A_norm, gate_weight) + gate_bias up = MatMulNBits(A_norm, up_weight) + up_bias Y = activation(gate) * up - + This operator is intended for decoder MLP patterns such as Qwen-style gate and up projections, but it remains semantically valid for both prefill and decode because the output shape is the standard MatMul result shape derived from the runtime shape of A and the shared attributes K and N. - + The operator contract includes a string attribute describing the fused gate activation. 
- + When fused from SkipSimplifiedLayerNormalization, the optional residual-sum output may also be materialized: - + A_norm, input_skip_bias_sum = SkipSimplifiedLayerNormalization(A, skip, norm_scale, epsilon) gate = MatMulNBits(A_norm, gate_weight) + gate_bias up = MatMulNBits(A_norm, up_weight) + up_bias @@ -3302,15 +3302,15 @@ This version of the operator has been available since version 1 of the 'com.micr MatMulNBitsQkv fuses either SimplifiedLayerNormalization (RMSNorm) or SkipSimplifiedLayerNormalization with three MatMulNBits projections that share the same normalized activation. - + A_norm = SimplifiedLayerNormalization(A, norm_scale, epsilon) Q = MatMulNBits(A_norm, q_weight) + q_bias K = MatMulNBits(A_norm, k_weight) + k_bias V = MatMulNBits(A_norm, v_weight) + v_bias - + If skip is provided, the operator computes the SkipSimplifiedLayerNormalization variant and may also return the input+skip residual sum as output 3. - + This operator is intended as a decode-oriented QKV fusion primitive. #### Version @@ -3440,7 +3440,7 @@ This version of the operator has been available since version 1 of the 'com.micr Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). - + The SwiGLU (Swish-Gated Linear Unit) activation function is like: g = xW + b l = xV + c @@ -3451,7 +3451,7 @@ This version of the operator has been available since version 1 of the 'com.micr When swiglu_fusion=0, two GEMMs are not fused, and they are FC1 and FC3 in the inputs. When swiglu_fusion=1, two GEMMs are fused so that g and l are computed in a single GEMM (FC1), and g and l are interleaved on each row of size 2 * inter_size. When swiglu_fusion=2, two GEMMs are fused, and g and l are concatenated on each row. 
- + #### Version @@ -3519,11 +3519,11 @@ This version of the operator has been available since version 1 of the 'com.micr Performs element-wise binary quantized multiplication (with Numpy-style broadcasting support). "This operator supports **multidirectional (i.e., Numpy-style) broadcasting**" The output of this op is the int32 accumulated result of the mul operation - + ``` C (int32) = (A - A_zero_point) * (B - B_zero_point) ``` - + #### Version @@ -3562,7 +3562,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MultiHeadAttention** Multi-Head Self/Cross Attention. Bias from input projection is included. - + The key padding mask is optional. When its shape is (batch_size, kv_sequence_length), value 0 means padding or 1 otherwise. When key has right-side padding, its shape could be (batch_size): it is actual length of each key sequence excluding paddings. @@ -3870,25 +3870,25 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.PackedAttention** This is the packed version of Attention. - + Sequences in one batch usually don't have same length and they are padded to have same length, e.g., below is a batch with 3 sequences and tokens* are padded. Sequence_0: 0, 1*, 2*, 3* Sequence_1: 4, 5, 6*, 7* Sequence_2: 8, 9, 10, 11 - + PackedAttention is designed to takes in packed input, i.e., only the real tokens without padding. An input as above will be packed into 3 tensors like below: - input ([h0, h4, h5, h8, h9, h10, h11]) - token_offset: 0, 4, 5, 8, 9, 10, 11, 1*, 2*, 3*, 6*, 7* - cumulated_token_count: 0, 1, 1+2, 1+2+4 - + Input tensors contains the hidden embedding of real tokens. Token_offset records the offset of token in the unpacked input. cumulated_token_count records cumulated length of each sequence length. - + The operator only supports BERT like model with padding on right now. 
- + #### Version @@ -3942,13 +3942,13 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.PackedMultiHeadAttention** This is the packed version of MultiHeadAttention. - + Sequences in one batch usually don't have same length and they are padded to have same length, e.g., below is a batch with 3 sequences and * is padding token. Sequence_0: 0, 1*, 2*, 3* Sequence_1: 4, 5, 6*, 7* Sequence_2: 8, 9, 10, 11 - + PackedMultiHeadAttention is designed to takes in packed input, i.e., only the real tokens without padding. An input as above will be packed into 3 tensors like below: - query ([q0, q4, q5, q8, q9, q10, q11]) @@ -3956,11 +3956,11 @@ This version of the operator has been available since version 1 of the 'com.micr - value ([v0, v4, v5, v8, v9, v10, v11]) - token_offset: 0, 4, 5, 8, 9, 10, 11, 1*, 2*, 3*, 6*, 7* - cumulative_sequence_length: 0, 1, 1+2, 1+2+4 - + The query, key and value tensors contain result of hidden embedding of real tokens after input projections. Token_offset records the offset of token in the unpacked input. cumulative_sequence_length records cumulated length of each sequence length. - + The operator only supports BERT like model with padding on right now. #### Version @@ -4032,7 +4032,7 @@ This version of the operator has been available since version 1 of the 'com.micr [0.0, 0.0, 4.5, 5.7], ], ] - + #### Version @@ -4074,16 +4074,16 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.PagedAttention** Paged Attention. - + This op leverages a block-based KV cache to enable continuous batching for LLMs. Currently, it is designed to work with the CUDA Execution Provider only. - + In other attention ops, batch entries typically aren't of the same length, so they are padded. Below is a batch with 3 sequences where * denotes a padding token. 
Sequence_0: 0, 1*, 2*, 3* Sequence_1: 4, 5, 6*, 7* Sequence_2: 8, 9, 10, 11 - + PagedAttention is designed to take in packed input, i.e., only the real tokens without padding. For example, the input shown above will be packed into 3 tensors like below: - query ([q0, q4, q5, q8, q9, q10, q11]) @@ -4091,10 +4091,10 @@ This version of the operator has been available since version 1 of the 'com.micr - value ([v0, v4, v5, v8, v9, v10, v11]) - cumulative_sequence_length: 0, 1, 1+2, 1+2+4 This packing omits padding tokens. - + The query, key and value tensors contain result of hidden embedding of real tokens after input projections. cumulative_sequence_length records cumulated length of each sequence length. - + #### Version @@ -4306,7 +4306,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QLinearAdd** Performs element-wise binary addition on 8 bit data types (with Numpy-style broadcasting support). - + C = (A_scale * (A - A_zero_point) + B_scale * (B - B_zero_point))/C_scale + C_zero_point #### Version @@ -4364,11 +4364,11 @@ This version of the operator has been available since version 1 of the 'com.micr output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1) ``` if ceil_mode is enabled - + ``` * pad_shape[i] is sum of pads along axis i ``` - + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i]) @@ -4378,9 +4378,9 @@ This version of the operator has been available since version 1 of the 'com.micr ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i] ``` - + The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero). 
- + Input and output scales and zero points are used to convert the output to a new quantization range. Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output) @@ -4648,7 +4648,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QLinearMul** Performs element-wise binary multiplication on 8 bit data types (with Numpy-style broadcasting support). - + C = ((A - A_zero_point) * (B - B_zero_point)) * (A_scale * B_scale)/C_scale + C_zero_point #### Version @@ -4699,10 +4699,10 @@ This version of the operator has been available since version 1 of the 'com.micr with the exception that numpy default keepdims to False instead of True. Input and Output scales and zero points are used to requantize the output in a new range. This helps to improve accuracy as after ReduceMean operation the range of the output is expected to decrease. - + ``` "Output = Dequantize(Input) -> ReduceMean on fp32 data -> Quantize(output)", - + ``` #### Version @@ -4752,7 +4752,7 @@ This version of the operator has been available since version 1 of the 'com.micr QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data (Tensor) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise. - Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` + Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` #### Version @@ -4893,20 +4893,20 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QMoE** Quantized mixture of experts (MoE). - + The quantized weights are stored in column major order per expert. The quantization block size can be specified. If not provided, column wise quantization is used. 
- + The formula of linear dequantization of the quantized weights using scale and (optionally) zero-point is: dequantized_weight = (quantized_weight - zero_point) * scale When zero_point is not provided, the default value is 2^(bits-1): 2 for 2 bits, 8 for 4 bits, 128 for 8 bits. - + If block_size is provided, both hidden_size and inter_size must be divisible by the block size, and the dequantization is performed per block of size block_size along the K (input feature) dimension. - + If block_size and zero_point are provided, both hidden_size and inter_size must be divisible by block_size * pack_size, where pack_size = 8 / expert_weight_bits. - + The SwiGLU (Swish-Gated Linear Unit) activation function is like: g = xW + b l = xV + c @@ -4917,7 +4917,7 @@ This version of the operator has been available since version 1 of the 'com.micr When swiglu_fusion=0, two GEMMs are not fused, and they are FC1 and FC3 in the inputs. When swiglu_fusion=1, two GEMMs are fused so that g and l are computed in a single GEMM (FC1), and g and l are interleaved on each row of size 2 * inter_size. When swiglu_fusion=2, two GEMMs are fused, and g and l are concatenated on each row. - + #### Version @@ -4949,7 +4949,7 @@ This version of the operator has been available since version 1 of the 'com.micr
use_sparse_mixer : int
Whether to use sparse mixer
weights_prepacked : int
-
Only meaningful when quant_type='int'. Tri-state control over whether the int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS fpA_intB format expected by the runner. -1 (auto): let the execution provider choose its own backward-compatible default; the CUDA EP treats auto as prepacked. 1: the initializers are already prepacked (e.g. produced offline by pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers are raw, un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself in PrePack(), matching the behavior of MatMulNBits and removing the offline pre-pack requirement from exporters. Defaults to -1 (auto) so each execution provider can pick its own backward-compatible default rather than the schema imposing one.
+
Only meaningful when quant_type='int'. Tri-state control over the layout of the int4/int8 fc1/fc2 weight initializers. The concrete prepacked layouts selected by -1 and 1 are determined by the execution provider. 0: the initializers are raw, un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits. Defaults to -1.
#### Inputs (6 - 21) @@ -5668,10 +5668,10 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.RemovePadding** Compress transformer input by removing paddings. It assumes padding is on the right side of sequence. - + The input has padding with shape (batch_size, sequence_length, hidden_size). This will generate two outputs: output has shape (total_tokens, hidden_size); token_offset with shape (batch_size, sequence_length). - + token_offset has offsets of all non-padding tokens first, then offset of all padding tokens. It is a list of batch_size * sequence_length elements, which is reshaped to 2D for convenience of shape inference. @@ -5714,7 +5714,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.RestorePadding** Restore paddings and fill padding with zeros. - + The input has padding with shape (total_tokens, hidden_size) and token_offset with shape (batch_size, sequence_length). The output has shape (batch_size, sequence_length, hidden_size). @@ -5961,16 +5961,16 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.SkipGroupNorm** This operator element-wise adds x, skip and bias, then apply group normalization and optional activation. - + This operator transforms input according to s = x + skip + bias y = gamma * (s - mean) / sqrt(variance + epsilon) + beta - + The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. The num_channels must be divisible by num_groups. The mean and standard-deviation of s are calculated separately over the each group. The weight and bias are per-channel affine transform parameter vectors of size num_channels. - + The activation attribute can be used to enable activation after group normalization. 
#### Version @@ -6174,36 +6174,36 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.SparseAttention** Block Sparse Attention used in Phi-3-small (https://arxiv.org/pdf/2404.14219). - + It is inspired by Sparse Transformers (https://arxiv.org/pdf/1904.10509) and BigBird (https://arxiv.org/pdf/2007.14062). - + block_mask can be used to configure sparse layout for different head. When number of sparse layout is 1, all heads have same sparse layout. Otherwise, different layouts are used cyclically. For example, given 4 layouts (S0, S1, S2, S3), 8 heads will have layouts like (S0, S1, S2, S3, S0, S1, S2, S3). - + The block_row_indices and block_col_indices are the CSR representation of block mask. The block_col_indices might contain paddings at the right side when different layout has different number of non-zeros in block mask. - + An example of block mask with 2 layouts where each layout is 4 x 4 blocks: [[[1, 0, 0, 0], [1, 1, 0, 0], [0, 1, 1, 0], [0, 1, 1, 1]], - + [[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], [1, 0, 1, 1]]] - + The corresponding CSR format: block_col_indices = [[0, 0, 1, 1, 2, 1, 2, 3, -1], [0, 0, 1, 0, 1, 2, 0, 2, 3]] block_row_indices = [[0, 1, 3, 5, 8], [0, 1, 3, 6, 9]] - + When do_rotary is True, cos_cache and sin_cache are required. Note that the maximum sequence length supported by cos or sin cache can be different from the maximum sequence length used by kv cache. - + Only supports unidirectional attention with cache of past key and value in linear buffers. - + For performance, past_key and present_key share same memory buffer, and past_value and present_value too. #### Version @@ -6397,7 +6397,7 @@ This version of the operator has been available since version 1 of the 'com.micr Based on Torch operator Embedding, creates a lookup table of embedding vectors of fixed size, for a dictionary of fixed size. 
- + #### Version @@ -6487,7 +6487,7 @@ This version of the operator has been available since version 1 of the 'com.micr the main diagonal. A negative k value includes as many diagonals below the main diagonal. If upper is set to false, a positive k retains the lower triangular matrix including k diagonals above the main diagonal. A negative k value excludes as many diagonals below the main diagonal. - + #### Version @@ -6579,7 +6579,7 @@ This version of the operator has been available since version 1 of the 'com.micr output_uniques = [2, 1, 3, 4] output_idx = [0, 1, 1, 2, 3, 2] output_counts = [1, 2, 2, 1] - + #### Version @@ -6891,5 +6891,3 @@ No versioning maintained for experimental ops.
T : tensor(float)
Constrain input and output types to float32 tensors.
- - diff --git a/docs/contrib_ops/cuda/moe_qmoe.md b/docs/contrib_ops/cuda/moe_qmoe.md index f30cb3354f024..6380fbe9a17f5 100644 --- a/docs/contrib_ops/cuda/moe_qmoe.md +++ b/docs/contrib_ops/cuda/moe_qmoe.md @@ -71,7 +71,7 @@ input tokens → router (top-k softmax) → permute by expert | `expert_weight_bits` (QMoE only) | int | 4 | 4 (INT4/MXFP4) or 8 (INT8/FP8). | | `block_size` (QMoE only) | int | -1 | Group size for INT4/INT8 group-wise quantization. -1 = per-output-channel. | | `quant_type` (QMoE only) | string | `"int"` | `"int"`, `"fp4"`, `"fp8"`, `"wfp4afp8"`. See [§3](#3-quantization-modes). | -| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. `-1` (auto): each EP picks its own backward-compatible default — the CUDA EP treats auto as prepacked. `1`: the INT4/INT8 `fc1`/`fc2` initializers are already in the CUTLASS `fpA_intB` layout (e.g. from `pack_weights_for_cuda_mixed_gemm`) and are consumed as-is. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()`. See [§5.1](#51-weights-input-2--5--8). | +| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. The prepacked layouts selected by `-1` and `1` are **EP-determined**. `-1` (default): the INT4/INT8 `fc1`/`fc2` initializers are already prepacked in the EP's default layout (e.g. from `pack_weights_for_cuda_mixed_gemm` for the CUDA EP). `1`: already prepacked in the EP's SM90 (Hopper) layout. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()`, always targeting the SM80 layout regardless of the runtime device SM. 
**Note:** the CUDA EP INT4/INT8 MoE GEMM always runs the Ampere (SM80) kernel — even on SM90 — so it consumes the SM80 `fpA_intB` layout on all architectures; `-1` and `1` are therefore equivalent for the CUDA EP today, and `1` is reserved for a possible future Hopper-specific layout. See [§5.1](#51-weights-input-2--5--8). | ### 2.2 Type Constraints @@ -230,24 +230,38 @@ extra subtraction. ### 5.1 Weights (input 2 / 5 / 8) **INT4/INT8** weight layout is controlled by the `weights_prepacked` attribute -([§2.1](#21-attributes)): - -- **`weights_prepacked=-1` (auto, default) or `1`** — the `fc1`/`fc2` weights are - assumed to already be in the CUTLASS `fpA_intB` layout, packed offline by - `pack_weights_for_cuda_mixed_gemm` (see [§6](#6-weight-formats)). They are - copied to GPU and consumed as-is (today's behaviour). +([§2.1](#21-attributes)). The prepacked layouts selected by `-1` and `1` are +determined by the execution provider: + +- **`weights_prepacked=-1` (default)** — the `fc1`/`fc2` weights are already in + the EP's default prepacked layout (e.g. packed offline by + `pack_weights_for_cuda_mixed_gemm` for the CUDA EP). They are copied to GPU + and consumed as-is. +- **`weights_prepacked=1`** — the `fc1`/`fc2` weights are already in the EP's + **SM90** (Hopper) prepacked layout (reserved; see the note below). - **`weights_prepacked=0`** — the `fc1`/`fc2` weights are raw, schema-conformant `[E, N, K/pack]` tensors as produced by `quantize_matmul_{4,8}bits`. `PrePack` runs the CUTLASS layout transform itself via `PrePackIntExpertWeights`, removing the offline pre-pack dependency. This makes integer QMoE symmetric with `MatMulNBits::PrePack_B`. +> **Single layout on the CUDA EP.** The CUDA EP INT4/INT8 MoE GEMM always +> dispatches to the Ampere (**SM80**) grouped-GEMM kernel — even on SM90 — +> because mixed int-weight + fp16/bf16 activation is not a valid Hopper TMA +> warp-specialized specialisation (`isValidHopperMOESpecialisation` is `false`). 
+> This matches **TensorRT-LLM**, which likewise routes `W4A16`/`W8A16` MoE to the +> SM80 kernel on Hopper; its Hopper TMA-WS mixed-dtype MoE kernel is reserved for +> `W4A8` (FP8 activation) and `WFP4A16` (FP4 weight). Consequently the CUDA EP +> consumes the **SM80 `fpA_intB` layout on every GPU**, `PrePack` always packs +> for SM80, and `weights_prepacked=-1` and `=1` are equivalent today. `1` is +> accepted and reserved for a possible future Hopper-specific layout (e.g. +> `W4A8`). There is therefore no architecture-match constraint: SM80-format +> weights run correctly on SM90 via the SM80 kernel. + `PrePackIntExpertWeights` loops over the `E` experts and, per expert, applies the same transpose + row-permutation / column-interleave / bias / pair-interleave -transform as `pack_weights_for_cuda_mixed_gemm` (see [§6.1](#61-int4-group-wise-quant_typeint-expert_weight_bits4)). -Packing is architecture-aware ([§7](#7-cross-architecture-packing-compatibility)): -SM90 (Hopper) is its own layout group, while all other supported arches -(SM75/80/86/89, SM100/120) share the SM80 layout. SM75+ is required. The source +transform as `pack_weights_for_cuda_mixed_gemm` (see [§6.1](#61-int4-group-wise-quant_typeint-expert_weight_bits4)), +always targeting the SM80 layout. SM75+ is required. The source `[E, N, K/pack]` initializers are released after their shapes are cached (`fc1_weights_shape_` / `fc2_weights_shape_`), so peak weight memory stays ~1×. The prepacked GPU buffers (`packed_fc1_weights_` / `packed_fc2_weights_`) are then @@ -440,6 +454,17 @@ weights are interchangeable across SMs: — does not use `pack_weights_for_cuda_mixed_gemm`. - **FP8**: no packing. +> **QMoE uses Group A on every GPU.** The table above describes the layouts the +> `pack_weights_for_cuda_mixed_gemm` *preprocessor* can emit. 
The QMoE INT4/INT8 +> MoE GEMM, however, always dispatches to the Ampere (SM80) grouped-GEMM kernel — +> even on SM90 — because mixed int-weight + fp16/bf16 activation is not a valid +> Hopper TMA warp-specialized specialisation (the same is true in TensorRT-LLM). +> It therefore consumes the **Group A (SM80) layout on all architectures, +> including Hopper**. For QMoE, always pack INT4/INT8 weights for SM80 (`arch=80`), +> and `PrePackIntExpertWeights` (`weights_prepacked=0`) does exactly that +> regardless of the runtime device SM. Group B (SM90) layout is currently unused +> by QMoE. + --- ## 8. SwiGLU Fusion diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc index c0ac6439bd86c..637ee8957e142 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc @@ -62,18 +62,26 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE this->quant_type_ = op_kernel_info.GetAttrOrDefault("quant_type", "int"); ORT_ENFORCE(quant_type_ == "int" || quant_type_ == "fp4" || quant_type_ == "fp8" || quant_type_ == "wfp4afp8", "quant_type must be 'int', 'fp4', 'fp8', or 'wfp4afp8', but got '", quant_type_, "'"); - // ``weights_prepacked`` is an optional tri-state attribute that defaults to - // -1 (auto) in the schema, so each EP picks its own backward-compatible - // default rather than the schema imposing one: - // -1 (auto, also the schema default): the EP decides. The CUDA EP's - // backward-compatible default is "prepacked" because all pre-existing - // tooling ships CUTLASS-prepacked weights. - // 1: initializers are already prepacked; the compute path reads them as-is. - // 0: initializers are raw [E, N, K/pack]; the PrePack hook lays them out. + // ``weights_prepacked`` is an optional tri-state attribute (default -1) that + // declares the layout of the int4/int8 fc1/fc2 weight initializers. 
The + // concrete prepacked layouts selected by -1 and 1 are determined by the + // execution provider. The CUDA EP maps the tri-state as: + // -1 (default): already prepacked in the EP's default int weight layout. + // 1: already prepacked in the EP's SM90 (Hopper) int weight layout. + // 0: raw [E, N, K/pack] initializers; the PrePack hook lays them out. + // + // Important: the CUDA QMoE int4/int8 MoE GEMM always dispatches to the + // Ampere (SM80) grouped-GEMM kernel -- even on SM90 -- because mixed + // int-weight + fp16/bf16 activation is not a valid Hopper TMA warp-specialized + // specialisation (see isValidHopperMOESpecialisation). The kernel therefore + // consumes the SM80/Ampere CUTLASS fpA_intB layout on every GPU. As a result + // the EP default (-1) is the SM80 layout regardless of the runtime device SM, + // and SM80-format weights are valid on SM90 (they run via the SM80 kernel). + // PrePack (weights_prepacked=0) packs for the SM80 layout accordingly. const int64_t weights_prepacked_mode = op_kernel_info.GetAttrOrDefault("weights_prepacked", static_cast(-1)); ORT_ENFORCE(weights_prepacked_mode == -1 || weights_prepacked_mode == 0 || weights_prepacked_mode == 1, - "weights_prepacked must be -1 (auto), 0, or 1, but got ", weights_prepacked_mode); + "weights_prepacked must be -1 (default), 0, or 1, but got ", weights_prepacked_mode); weights_prepacked_ = (weights_prepacked_mode != 0); #if !defined(ENABLE_FP4) || !defined(USE_FP4_QMOE) ORT_ENFORCE(quant_type_ != "fp4", "QMoE quant_type='fp4' requires USE_FP4_QMOE with CUDA 12.8 or newer."); @@ -1158,7 +1166,15 @@ void QMoE::PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, Al const int64_t k_packed = shape[2]; const int64_t k = k_packed * pack_factor; - const int packing_sm = onnxruntime::llm::kernels::weight_only::get_arch_for_mixed_gemm_weight_preprocess(sm_); + // The CUDA QMoE int4/int8 MoE GEMM always dispatches to the Ampere (SM80) + // grouped-GEMM kernel -- even on SM90 -- 
because mixed int-weight + fp16/bf16 + // is not a valid Hopper TMA warp-specialized specialisation. The kernel thus + // consumes the SM80 CUTLASS fpA_intB layout on every GPU, so the weights must + // always be preprocessed for SM80 regardless of the runtime device SM. + // (Using get_arch_for_mixed_gemm_weight_preprocess(sm_) here would emit the + // SM90 layout on Hopper, which the SM80 kernel cannot consume -> wrong output.) + const int packing_sm = + onnxruntime::llm::kernels::weight_only::get_arch_for_mixed_gemm_weight_preprocess(80); // Per-expert sizes. const size_t per_expert_bytes = static_cast(n) * static_cast(k) / pack_factor; diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h index 5722ac41cc470..2bbadc205b5d8 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h +++ b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.h @@ -46,16 +46,23 @@ class QMoE final : public CudaKernel, public MoEBase { IAllocatorUniquePtr& packed_buf, bool& is_packed); int64_t expert_weight_bits_; bool is_fp16_; - // When true (the schema default), the int4/int8 fc1/fc2 weight - // initializers are already in the CUTLASS fpA_intB layout — produced - // offline e.g. via ``pack_weights_for_cuda_mixed_gemm`` — and the - // compute path reads them as-is. When false, the raw schema-conformant - // ``[E, N, K/pack]`` layout (as produced by - // ``quantize_matmul_{4,8}bits``) is rewritten inside the PrePack hook - // via ``PrePackIntExpertWeights``, removing the offline prepack - // dependency. Only meaningful when ``quant_type_ == "int"``. Derived from - // the optional tri-state ``weights_prepacked`` attribute: -1/auto (or - // absent) maps to true on the CUDA EP, 1 maps to true, 0 maps to false. + // When true, the int4/int8 fc1/fc2 weight initializers are already in a + // CUTLASS fpA_intB layout — produced offline e.g. 
via + // ``pack_weights_for_cuda_mixed_gemm`` — and the compute path reads them + // as-is. When false, the raw schema-conformant ``[E, N, K/pack]`` layout + // (as produced by ``quantize_matmul_{4,8}bits``) is rewritten inside the + // PrePack hook via ``PrePackIntExpertWeights``, removing the offline + // prepack dependency. Only meaningful when ``quant_type_ == "int"``. + // Derived from the optional tri-state ``weights_prepacked`` attribute: + // -1 (default) and 1 both map to true; 0 maps to false. The concrete + // prepacked layouts selected by -1 and 1 are determined by the execution + // provider. For the CUDA EP the int4/int8 MoE GEMM always dispatches to the + // Ampere (SM80) grouped-GEMM kernel -- even on SM90 -- because mixed + // int-weight + fp16/bf16 activation is not a valid Hopper TMA warp-specialized + // specialisation (matches TensorRT-LLM, which also routes W4A16/W8A16 MoE to + // the SM80 kernel on Hopper). The kernel therefore consumes the SM80 fpA_intB + // layout on every GPU, so -1 and 1 are currently equivalent for the CUDA EP; + // 1 is reserved for a possible future Hopper-specific layout (e.g. W4A8). bool weights_prepacked_ = true; // Cached source weight shapes captured at PrePack time. When the // PrePack hook consumed and released the original int4/int8 weight diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 6eec50a400436..f3f2f521ecab2 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1520,18 +1520,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA( AttributeProto::STRING, std::string("int")) .Attr("weights_prepacked", - "Only meaningful when quant_type='int'. Tri-state control over whether the " - "int4/int8 fc1/fc2 weight initializers are already laid out in the CUTLASS " - "fpA_intB format expected by the runner. 
-1 (auto): let the execution provider " - "choose its own backward-compatible default; the CUDA EP treats auto as " - "prepacked. 1: the initializers are already prepacked (e.g. produced offline by " - "pack_weights_for_cuda_mixed_gemm) and are consumed as-is. 0: the initializers " - "are raw, un-prepacked [E, N, K/pack] tensors as produced by " - "quantize_matmul_{4,8}bits; the kernel runs the CUTLASS layout transform itself " - "in PrePack(), matching the behavior of MatMulNBits and removing the offline " - "pre-pack requirement from exporters. Defaults to -1 (auto) so each execution " - "provider can pick its own backward-compatible default rather than the schema " - "imposing one.", + "Only meaningful when quant_type='int'. Tri-state control over the layout of the " + "int4/int8 fc1/fc2 weight initializers. The concrete prepacked layouts selected by " + "-1 and 1 are determined by the execution provider. 0: the initializers are raw, " + "un-prepacked [E, N, K/pack] tensors as produced by quantize_matmul_{4,8}bits. Defaults to -1.", AttributeProto::INT, static_cast(-1)) .Input(0, diff --git a/onnxruntime/test/python/transformers/test_moe_cuda.py b/onnxruntime/test/python/transformers/test_moe_cuda.py index c5fc826a5a6ed..cdf0f05f222fe 100644 --- a/onnxruntime/test/python/transformers/test_moe_cuda.py +++ b/onnxruntime/test/python/transformers/test_moe_cuda.py @@ -152,7 +152,10 @@ def quant_dequant(weights, is_4_bit_quantization: bool = True): q_weight_reshaped = q_weight.reshape(n, -1) # Pack weights for CUDA mixed-gemm kernel (FpA_IntB format), and qMoE kernel uses the same format. - processed_q_weight = _quantize.pack_weights_for_cuda_mixed_gemm(q_weight_reshaped, n, k, 4) + # Pin arch=80: the QMoE grouped MoE GEMM always runs the Ampere (SM80) kernel -- even on SM90 -- + # so it consumes the SM80 (column-interleaved) layout on every GPU. Auto-detect (force_arch=-1) + # would emit the non-interleaved SM90 layout on Hopper and produce wrong results. 
+ processed_q_weight = _quantize.pack_weights_for_cuda_mixed_gemm(q_weight_reshaped, n, k, 4, 80) # So we need to DEQUANTIZE back to get `result`. # scale is [n, block_per_k] @@ -232,8 +235,11 @@ def quant_dequant(weights, is_4_bit_quantization: bool = True): ) q_weight_reshaped = q_weight.reshape(n, -1) - # Pack weights for CUDA mixed-gemm kernel (FpA_IntB format) - processed_q_weight = _quantize.pack_weights_for_cuda_mixed_gemm(q_weight_reshaped, n, k, 8) + # Pack weights for CUDA mixed-gemm kernel (FpA_IntB format). + # Pin arch=80: the QMoE grouped MoE GEMM always runs the Ampere (SM80) kernel -- even on SM90 -- + # so it consumes the SM80 (column-interleaved) layout on every GPU. Auto-detect (force_arch=-1) + # would emit the non-interleaved SM90 layout on Hopper and produce wrong results. + processed_q_weight = _quantize.pack_weights_for_cuda_mixed_gemm(q_weight_reshaped, n, k, 8, 80) # Dequantize for reference # (q - 128) * scale if using 128 offset? or (q) * scale if symmetric around 0? diff --git a/onnxruntime/test/python/transformers/test_qmoe_cuda.py b/onnxruntime/test/python/transformers/test_qmoe_cuda.py index 10a856c07366f..38636daedae4b 100644 --- a/onnxruntime/test/python/transformers/test_qmoe_cuda.py +++ b/onnxruntime/test/python/transformers/test_qmoe_cuda.py @@ -138,8 +138,12 @@ def print_diff_statistics(diff_tensor: torch.Tensor, prefix: str = ""): def preprocess_weights_for_mixed_gemm( - tensor: torch.Tensor, quant_bits: int, sm: int = -1, do_weight_interleave: bool = True + tensor: torch.Tensor, quant_bits: int, sm: int = 80, do_weight_interleave: bool = True ) -> torch.Tensor: + # ``sm`` defaults to 80: the QMoE grouped MoE GEMM always runs the Ampere (SM80) + # kernel -- even on SM90 -- so it consumes the SM80 (column-interleaved) layout on + # every GPU. Passing sm=-1 (auto-detect) would emit the non-interleaved SM90 layout + # on Hopper and produce wrong results. 
if len(tensor.shape) == 2: tensor = tensor.unsqueeze(0) From 6550b204f6d9f53fac1694785efd9db76d26a339 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 10 Jun 2026 01:53:16 -0700 Subject: [PATCH 4/6] clean up tests --- .../test/python/transformers/test_moe_cuda.py | 4 +- .../python/transformers/test_qmoe_cuda.py | 41 ------------------- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_moe_cuda.py b/onnxruntime/test/python/transformers/test_moe_cuda.py index cdf0f05f222fe..9677542270a53 100644 --- a/onnxruntime/test/python/transformers/test_moe_cuda.py +++ b/onnxruntime/test/python/transformers/test_moe_cuda.py @@ -1090,8 +1090,8 @@ def parity_check(self): ort_dtype_quant_bits_tolerance_map = { "FP32:0": (5e-3, 1e-3), "FP16:0": (0.3, 0.05), - "FP16:4": (3.0, 1e-2), - "FP16:8": (2.0, 1e-2), + "FP16:4": (0.5, 1e-2), + "FP16:8": (0.5, 1e-2), "BF16:0": (1.0, 1e-2), "BF16:4": (30.0, 1e-1), "BF16:8": (20.0, 1e-1), diff --git a/onnxruntime/test/python/transformers/test_qmoe_cuda.py b/onnxruntime/test/python/transformers/test_qmoe_cuda.py index 38636daedae4b..c56383d2851d3 100644 --- a/onnxruntime/test/python/transformers/test_qmoe_cuda.py +++ b/onnxruntime/test/python/transformers/test_qmoe_cuda.py @@ -137,47 +137,6 @@ def print_diff_statistics(diff_tensor: torch.Tensor, prefix: str = ""): ) -def preprocess_weights_for_mixed_gemm( - tensor: torch.Tensor, quant_bits: int, sm: int = 80, do_weight_interleave: bool = True -) -> torch.Tensor: - # ``sm`` defaults to 80: the QMoE grouped MoE GEMM always runs the Ampere (SM80) - # kernel -- even on SM90 -- so it consumes the SM80 (column-interleaved) layout on - # every GPU. Passing sm=-1 (auto-detect) would emit the non-interleaved SM90 layout - # on Hopper and produce wrong results. - if len(tensor.shape) == 2: - tensor = tensor.unsqueeze(0) - - # Input tensor shape is [Experts, n, k_packed]. k_packed is k/2 for 4-bit, k for 8-bit. 
- num_experts = tensor.shape[0] - n = tensor.shape[1] - k_packed = tensor.shape[2] - k = k_packed * 2 if quant_bits == 4 else k_packed - - packed_list = [] - - if _pybind and hasattr(_pybind, "pack_weights_for_cuda_mixed_gemm") and torch.cuda.is_available(): - for i in range(num_experts): - if tensor[i].dtype == torch.bfloat16: - weight = tensor[i].to(torch.float32).cpu().numpy() - else: - weight = tensor[i].cpu().numpy() - packed = _pybind.pack_weights_for_cuda_mixed_gemm(weight, n, k, quant_bits, sm) - # pack_weights_for_cuda_mixed_gemm returns int8 array of shape [packed_size] - # We need to reshape it to (k, n/2) for 4-bit, (k, n) for 8-bit. - output_rows = k - output_cols = n // 2 if quant_bits == 4 else n - packed_tensor = torch.from_numpy(packed).to(tensor.device) - packed_tensor = packed_tensor.view(torch.uint8).view(output_rows, output_cols) - packed_list.append(packed_tensor) - - return torch.stack(packed_list) - else: - # This shall not happen unless older version of onnxruntime is used. - raise ImportError( - "onnxruntime._pybind_state.pack_weights_for_cuda_mixed_gemm not found. Cannot preprocess weights." 
- ) - - def quant_dequant_blockwise(weights, block_size, is_4_bit_quantization: bool = True, asymmetric: bool = False): # DEBUG # print(f"DEBUG: quant_dequant input shape={weights.shape}, 4bit={is_4_bit_quantization}, asym={asymmetric}") From d8c226474cf024a05d0231ee890c40ffc3f6e07e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 10 Jun 2026 20:07:50 +0000 Subject: [PATCH 5/6] update doc --- docs/ContribOperators.md | 352 ++++++++++++++++++++------------------- 1 file changed, 177 insertions(+), 175 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 478e645664203..38d101786b41a 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -125,24 +125,24 @@ Do not modify directly.* ### **com.microsoft.Attention** Multi-Head Attention that can be either unidirectional (like GPT-2) or bidirectional (like BERT). - + The weights for input projection of Q, K and V are merged. The data is stacked on the second dimension. Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). Here hidden_size is the hidden dimension of Q and K, and v_hidden_size is that of V. - + The mask_index is optional. Besides raw attention mask with shape (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) with value 0 for masked and 1 otherwise, we support other two formats: When input has right-side padding, mask_index is one dimension with shape (batch_size), where value is actual sequence length excluding padding. When input has left-side padding, mask_index has shape (2 * batch_size), where the values are the exclusive end positions followed by the inclusive start positions. - + When unidirectional is 1, each token only attends to previous tokens. - + Both past and present state are optional. They shall be used together, and not allowed to use only one of them. The qkv_hidden_sizes is required only when K and V have different hidden sizes. 
- + When there is past state, hidden dimension for Q, K and V shall be the same. - + The total_sequence_length is past_sequence_length + kv_sequence_length. Here kv_sequence_length is the length of K or V. For self attention, kv_sequence_length equals to sequence_length (sequence length of Q). For cross attention, query and key might have different lengths. @@ -214,133 +214,133 @@ This version of the operator has been available since version 1 of the 'com.micr Computes an one-layer RNN where its RNN Cell is an AttentionWrapper wrapped a LSTM Cell. The RNN layer contains following basic component: LSTM Cell, Bahdanau Attention Mechanism, AttentionWrapp. - + Activation functions: - + Relu(x) - max(0, x) - + Tanh(x) - (1 - e^{-2x})/(1 + e^{-2x}) - + Sigmoid(x) - 1/(1 + e^{-x}) - + (NOTE: Below are optional) - + Affine(x) - alpha*x + beta - + LeakyRelu(x) - x if x >= 0 else alpha * x - + ThresholdedRelu(x) - x if x >= alpha else 0 - + ScaledTanh(x) - alpha*Tanh(beta*x) - + HardSigmoid(x) - min(max(alpha*x + beta, 0), 1) - + Elu(x) - x if x >= 0 else alpha*(e^x - 1) - + Softsign(x) - x/(1 + |x|) - + Softplus(x) - log(1 + e^x) - + Softmax(x) - exp(x) / sum(exp(x)) - + Bahdanau Attention Mechanism: `M` - Memory tensor. - + `VALUES` - masked Memory by its real sequence length. - + `MW` - Memory layer weight. - + `KEYS` - Processed memory tensor by the memory layer. KEYS = M * MW - + `Query` - Query tensor, normally at specific time step in sequence. 
- + `QW` - Query layer weight in the attention mechanism - + `PQ` - processed query, = `Query` * `QW` - + `V' - attention vector - + `ALIGN` - calculated alignment based on Query and KEYS ALIGN = softmax(reduce_sum(`V` * Tanh(`KEYS` + `PQ`))) - + `CONTEXT` - context based on `ALIGN` and `VALUES` CONTEXT = `ALIGN` * `VALUES` - - + + LSTM Cell: `X` - input tensor concat with attention state in the attention wrapper - + `i` - input gate - + `o` - output gate - + `f` - forget gate - + `c` - cell gate - + `t` - time step (t-1 means previous time step) - + `W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates - + `R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates - + `Wb[iofc]` - W bias vectors for input, output, forget, and cell gates - + `Rb[iofc]` - R bias vectors for input, output, forget, and cell gates - + `P[iof]` - P peephole weight vector for input, output, and forget gates - + `WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates - + `RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates - + `WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates - + `RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates - + `PB[iof]` - P peephole weight vector for backward input, output, and forget gates - + `H` - Hidden state - + `num_directions` - 2 if direction == bidirectional else 1 - + Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): - + - it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) - + - ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) - + - ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) - + - Ct = ft (.) Ct-1 + it (.) ct - + - ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) - + - Ht = ot (.) h(Ct) - - + + AttentionWrapp Notations: `lstm()' - wrapped inner cell. Ht, Ct = lstm(concat(Xt, ATTNt-1), Ct-1) - + `am()` - attention mechanism the wrapper used. 
CONTEXTt, ALIGNt = am(Ht, ALIGNt-1) - + `AW` - attention layer weights, optional. - + `ATTN` - attention state, initial is zero. If `AW` provided, it is the output of the attention layer, ATTNt = concat(Ht, CONTEXTt) * AW otherwise, ATTNt = CONTEXTt - + RNN layer output: `Y` - if needed is the sequence of Ht from lstm cell. - + `Y_h` - is the last valid H from lstm cell. - + `Y_c` - is the last valid C from lstm cell. - + #### Version @@ -594,7 +594,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.BiasGelu** Bias Gelu. - It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. + It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. #### Version @@ -819,7 +819,7 @@ This version of the operator has been available since version 1 of the 'com.micr ``` scale = 1. / (1. - ratio). ``` - + This op functions in much the same was as Dropout-11 and Dropout-13 do, except that the mask is output as a bit-packed uint32 tensor, instead of a boolean tensor. #### Version @@ -907,20 +907,20 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.CausalConvWithState** Stateful causal depthwise convolution, generalized to N spatial dimensions. - + Used by Gated DeltaNet (Qwen3.5) and Mamba (Jamba, FalconMamba) as a preprocessing step. Replaces the 3-op pattern (Concat + Conv + Slice) with a single fused operation. - + The convolution is causal (looks only at current and past positions along the last spatial dimension) and depthwise (each channel is convolved independently with its own kernel). - + Input layout is channels-first: (batch_size, channels, ...). Weight layout: (channels, 1, k_1, ...) for depthwise convolution. The carry state stores the last (k-1) positions along the causal axis for incremental decode. 
- + The ndim attribute generalizes the op to 1D, 2D, or 3D spatial dimensions. Causality is enforced on the last spatial dimension only. - + The optional activation attribute supports fused SiLU/Swish activation. #### Version @@ -1277,17 +1277,17 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.DecoderMaskedSelfAttention** Self attention that supports input sequence length of 1. - + The weights for input projection of Q, K and V are merged. The data is stacked on the second dimension. Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). Here hidden_size is the hidden dimension of Q and K, and v_hidden_size is that of V. - + The mask_index is optional. If it is provided, only raw attention mask with shape (batch_size, total_sequence_length) is supported currently. - + Both past and present state need to be provided. - + The qkv_hidden_sizes is required only when K and V have different hidden sizes. - + The total_sequence_length is past_sequence_length + kv_sequence_length. Here kv_sequence_length is the length of K or V. Currently, only self attention is supported which means that kv_sequence_length equals to sequence_length (sequence length of Q). @@ -2349,12 +2349,12 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.GemmaRotaryEmbedding** GemmaRotaryEmbedding is the implementation of below part of rotary positional embeddings (RoPE). It implements below from modeling_gemma.py. 
- + Here's onnxscript that was tested - + from onnxscript import FLOAT, FLOAT16, script from onnxscript import opset18 as op - + @script() def gemma_rotary_embedding(emb: FLOAT["bs", "seq_len", "dim"], q: FLOAT16["bs", "num_heads", "seq_len", "dim"], q_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"], k: FLOAT16["bs", "num_heads", "seq_len", "dim"], k_rot: FLOAT16["bs", "num_heads", "seq_len", "dim"]): sin_val = op.Sin(emb) @@ -2366,10 +2366,10 @@ This version of the operator has been available since version 1 of the 'com.micr q_embed = (q * casted_cos) + (q_rot * casted_sin) k_embed = (k * casted_cos) + (k_rot * casted_sin) return q_embed, k_embed - + onnx_model = gemma_rotary_embedding.to_model_proto() - - + + #### Version @@ -2485,7 +2485,7 @@ This version of the operator has been available since version 1 of the 'com.micr which are used to interpolate the output value `output[n, :, h, w]`. The GridSample operator is often used in doing grid generator and sampler in the [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/master/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample). - + #### Version @@ -2531,13 +2531,13 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.GroupNorm** Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization (https://arxiv.org/abs/1803.08494). - + This operator transforms input according to y = gamma * (x - mean) / sqrt(variance + epsilon) + beta - + The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. num_channels must be divisible by num_groups. The mean and standard-deviation are calculated separately over the each group. The weight and bias are per-channel affine transform parameter vectors of size num_channels. 
- + The activation attribute can be used to enable activation after group normalization. #### Version @@ -2588,21 +2588,21 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.GroupQueryAttention** Group Query Self/Cross Attention with KV Cache Quantization Support. - + This operator implements causal grouped-query attention with past state (KV cache) support. It also supports optional float8, int8 or int4 quantization for the KV cache to reduce memory footprint. - + **Cache Format:** The past and present KV cache tensors are expected in a BNSH format: `(batch_size, num_heads, cache_sequence_length, head_size)`, where `cache_sequence_length` is the length of the cached key/value sequences, or the maximum sequence length when past and present buffer sharing is used. - + **Quantization:** When quantization is enabled, `past_key` and `past_value` inputs can be of type `float8e4m3fn`, `uint8` or `int8`. The corresponding `k_scale` and `v_scale` tensors must be provided. The operator will output `present_key` and `present_value` in same format as the `past_key` and `past_value`. - + For 4-bit quantization, the data type is uint8 where each byte contains two 4-bit values. The bit width of quantized KV cache can be set using `kv_cache_bit_width` attribute. - + The shapes of the k_scale, v_scale tensors shall be broadcastable to present_key shape. - + **Quantization Modes (`k_quant_type`, `v_quant_type` attributes):** - **"NONE"**: No quantization. - **"PER_TENSOR"**: A single scale for the entire tensor. Scale example shape: `[1]`. @@ -2779,18 +2779,18 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.LinearAttention** Unified linear attention operator for autoregressive decoding (T=1) and prefill (T>1). - + All inputs use 3D packed format [B, T, H*D]; q_num_heads and kv_num_heads are always required. The op internally unpacks to 4D for computation. 
- + The update_rule attribute selects the recurrence type: - "linear": S_t = S_{t-1} + k_t ⊗ v_t; o_t = scale * q_t^T S_t - "gated": S_t = exp(g_t) * S_{t-1} + k_t ⊗ v_t; o_t = scale * q_t^T S_t - "delta": S_t = S_{t-1} + β_t * k_t ⊗ (v_t - S_{t-1}^T k_t); o_t = scale * q_t^T S_t - "gated_delta": S_t = exp(g_t) * S_{t-1} + β_t * k_t ⊗ (v_t - exp(g_t) * S_{t-1}^T k_t); o_t = scale * q_t^T S_t - + where g_t is the decay (in log-space), β_t is the update rate, and ⊗ denotes outer product. - + Semantics: Equivalent to running the recurrent update sequentially for each token, but may be implemented using chunk-parallel algorithms for GPU efficiency. @@ -2854,10 +2854,10 @@ This version of the operator has been available since version 1 of the 'com.micr Longformer Self Attention with a local context and a global context. Tokens attend locally: Each token attends to its W previous tokens and W succeeding tokens with W being the window length. A selected few tokens attend globally to all other tokens. - + The attention mask is of shape (batch_size, sequence_length), where sequence_length is a multiple of 2W after padding. Mask value < 0 (like -10000.0) means the token is masked, 0 otherwise. - + Global attention flags have value 1 for the tokens attend globally and 0 otherwise. #### Version @@ -2916,32 +2916,32 @@ This version of the operator has been available since version 1 of the 'com.micr 2. Input B is quantized with 4 bits with quantization data type specified by attribute 'quant_type'. It is transposed, flattened and quantized blockwisely with block size specified by attribute 'block_size'. And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. 3. Input B's quantization constants or scales are specified by input 'absmax'. - + Input B is stored as uint8_t with shape: [(N * K + 1) / 2]. 
Input absmax is stored in same type as original type of B(float32, float16) with shape like: [(N * K + block_size - 1) / block_size]. - - + + 1. (Default value) transB=True (Majorly used for forward pass) Shape of A: [D0, D1, ..., Dn, K] Shape of Dequanted B: [N, K], this is aligned with how PyTorch defined the linear weight, .e.g [out_features, in_features]. - + The computation math: dequant_B = dequant(B, absmax, quant_type, block_size) transposed_dequant_B = dequant_B^T output = A @ transposed_dequant_B - + Shape of output: [D0, D1, ..., Dn, N] - + 2. transB=False (Majorly used for backward pass) Shape of A: [D0, D1, ..., Dn, N] Shape of Dequanted B: [N, K], this is aligned with how PyTorch defined the linear weight, .e.g [out_features, in_features]. - + The computation math: dequant_B = dequant(B, absmax, quant_type, block_size) output = A @ dequant_B - + Shape of output: [D0, D1, ..., Dn, K] - + #### Version @@ -3127,17 +3127,17 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MatMulNBits** MatMulNBits performs a matrix multiplication where the right-hand-side matrix (weights) is quantized to N bits. - + It is a fusion of two operations: 1. Linear dequantization of the quantized weights using scale and (optionally) zero-point with formula: dequantized_weight = (quantized_weight - zero_point) * scale 2. Matrix multiplication between the input matrix A and the dequantized weight matrix. - + The weight matrix is a 2D constant matrix with the input feature count and output feature count specified by attributes 'K' and 'N'. It is quantized block-wise along the K dimension with a block size specified by the 'block_size' attribute. The block size must be a power of 2 and not smaller than 16 (e.g., 16, 32, 64, 128). Each block has its own scale and zero-point. The quantization is performed using a bit-width specified by the 'bits' attribute, which can take values from 2 to 8. 
- + The quantized weights are stored in a bit-packed format along the K dimension, with each block being represented by a blob of uint8. For example, for 4 bits, the first 4 bits are stored in the lower 4 bits of a byte, and the second 4 bits are stored in the higher 4 bits of a byte. @@ -3201,32 +3201,32 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MatMulNBitsMlp** MatMulNBitsMlp fuses two MatMulNBits projections that share the same input and computes - + gate = MatMulNBits(A, gate_weight) + gate_bias up = MatMulNBits(A, up_weight) + up_bias Y = activation(gate) * up - + It can also optionally fuse SimplifiedLayerNormalization or SkipSimplifiedLayerNormalization before the two projections: - + A_norm = SimplifiedLayerNormalization(A, norm_scale, epsilon) gate = MatMulNBits(A_norm, gate_weight) + gate_bias up = MatMulNBits(A_norm, up_weight) + up_bias Y = activation(gate) * up - + A_norm = SkipSimplifiedLayerNormalization(A, skip, norm_scale, epsilon) gate = MatMulNBits(A_norm, gate_weight) + gate_bias up = MatMulNBits(A_norm, up_weight) + up_bias Y = activation(gate) * up - + This operator is intended for decoder MLP patterns such as Qwen-style gate and up projections, but it remains semantically valid for both prefill and decode because the output shape is the standard MatMul result shape derived from the runtime shape of A and the shared attributes K and N. - + The operator contract includes a string attribute describing the fused gate activation. 
- + When fused from SkipSimplifiedLayerNormalization, the optional residual-sum output may also be materialized: - + A_norm, input_skip_bias_sum = SkipSimplifiedLayerNormalization(A, skip, norm_scale, epsilon) gate = MatMulNBits(A_norm, gate_weight) + gate_bias up = MatMulNBits(A_norm, up_weight) + up_bias @@ -3302,15 +3302,15 @@ This version of the operator has been available since version 1 of the 'com.micr MatMulNBitsQkv fuses either SimplifiedLayerNormalization (RMSNorm) or SkipSimplifiedLayerNormalization with three MatMulNBits projections that share the same normalized activation. - + A_norm = SimplifiedLayerNormalization(A, norm_scale, epsilon) Q = MatMulNBits(A_norm, q_weight) + q_bias K = MatMulNBits(A_norm, k_weight) + k_bias V = MatMulNBits(A_norm, v_weight) + v_bias - + If skip is provided, the operator computes the SkipSimplifiedLayerNormalization variant and may also return the input+skip residual sum as output 3. - + This operator is intended as a decode-oriented QKV fusion primitive. #### Version @@ -3440,7 +3440,7 @@ This version of the operator has been available since version 1 of the 'com.micr Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). - + The SwiGLU (Swish-Gated Linear Unit) activation function is like: g = xW + b l = xV + c @@ -3451,7 +3451,7 @@ This version of the operator has been available since version 1 of the 'com.micr When swiglu_fusion=0, two GEMMs are not fused, and they are FC1 and FC3 in the inputs. When swiglu_fusion=1, two GEMMs are fused so that g and l are computed in a single GEMM (FC1), and g and l are interleaved on each row of size 2 * inter_size. When swiglu_fusion=2, two GEMMs are fused, and g and l are concatenated on each row. 
- + #### Version @@ -3519,11 +3519,11 @@ This version of the operator has been available since version 1 of the 'com.micr Performs element-wise binary quantized multiplication (with Numpy-style broadcasting support). "This operator supports **multidirectional (i.e., Numpy-style) broadcasting**" The output of this op is the int32 accumulated result of the mul operation - + ``` C (int32) = (A - A_zero_point) * (B - B_zero_point) ``` - + #### Version @@ -3562,7 +3562,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MultiHeadAttention** Multi-Head Self/Cross Attention. Bias from input projection is included. - + The key padding mask is optional. When its shape is (batch_size, kv_sequence_length), value 0 means padding or 1 otherwise. When key has right-side padding, its shape could be (batch_size): it is actual length of each key sequence excluding paddings. @@ -3870,25 +3870,25 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.PackedAttention** This is the packed version of Attention. - + Sequences in one batch usually don't have same length and they are padded to have same length, e.g., below is a batch with 3 sequences and tokens* are padded. Sequence_0: 0, 1*, 2*, 3* Sequence_1: 4, 5, 6*, 7* Sequence_2: 8, 9, 10, 11 - + PackedAttention is designed to takes in packed input, i.e., only the real tokens without padding. An input as above will be packed into 3 tensors like below: - input ([h0, h4, h5, h8, h9, h10, h11]) - token_offset: 0, 4, 5, 8, 9, 10, 11, 1*, 2*, 3*, 6*, 7* - cumulated_token_count: 0, 1, 1+2, 1+2+4 - + Input tensors contains the hidden embedding of real tokens. Token_offset records the offset of token in the unpacked input. cumulated_token_count records cumulated length of each sequence length. - + The operator only supports BERT like model with padding on right now. 
- + #### Version @@ -3942,13 +3942,13 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.PackedMultiHeadAttention** This is the packed version of MultiHeadAttention. - + Sequences in one batch usually don't have same length and they are padded to have same length, e.g., below is a batch with 3 sequences and * is padding token. Sequence_0: 0, 1*, 2*, 3* Sequence_1: 4, 5, 6*, 7* Sequence_2: 8, 9, 10, 11 - + PackedMultiHeadAttention is designed to takes in packed input, i.e., only the real tokens without padding. An input as above will be packed into 3 tensors like below: - query ([q0, q4, q5, q8, q9, q10, q11]) @@ -3956,11 +3956,11 @@ This version of the operator has been available since version 1 of the 'com.micr - value ([v0, v4, v5, v8, v9, v10, v11]) - token_offset: 0, 4, 5, 8, 9, 10, 11, 1*, 2*, 3*, 6*, 7* - cumulative_sequence_length: 0, 1, 1+2, 1+2+4 - + The query, key and value tensors contain result of hidden embedding of real tokens after input projections. Token_offset records the offset of token in the unpacked input. cumulative_sequence_length records cumulated length of each sequence length. - + The operator only supports BERT like model with padding on right now. #### Version @@ -4032,7 +4032,7 @@ This version of the operator has been available since version 1 of the 'com.micr [0.0, 0.0, 4.5, 5.7], ], ] - + #### Version @@ -4074,16 +4074,16 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.PagedAttention** Paged Attention. - + This op leverages a block-based KV cache to enable continuous batching for LLMs. Currently, it is designed to work with the CUDA Execution Provider only. - + In other attention ops, batch entries typically aren't of the same length, so they are padded. Below is a batch with 3 sequences where * denotes a padding token. 
Sequence_0: 0, 1*, 2*, 3* Sequence_1: 4, 5, 6*, 7* Sequence_2: 8, 9, 10, 11 - + PagedAttention is designed to take in packed input, i.e., only the real tokens without padding. For example, the input shown above will be packed into 3 tensors like below: - query ([q0, q4, q5, q8, q9, q10, q11]) @@ -4091,10 +4091,10 @@ This version of the operator has been available since version 1 of the 'com.micr - value ([v0, v4, v5, v8, v9, v10, v11]) - cumulative_sequence_length: 0, 1, 1+2, 1+2+4 This packing omits padding tokens. - + The query, key and value tensors contain result of hidden embedding of real tokens after input projections. cumulative_sequence_length records cumulated length of each sequence length. - + #### Version @@ -4306,7 +4306,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QLinearAdd** Performs element-wise binary addition on 8 bit data types (with Numpy-style broadcasting support). - + C = (A_scale * (A - A_zero_point) + B_scale * (B - B_zero_point))/C_scale + C_zero_point #### Version @@ -4364,11 +4364,11 @@ This version of the operator has been available since version 1 of the 'com.micr output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - kernel_spatial_shape[i]) / strides_spatial_shape[i] + 1) ``` if ceil_mode is enabled - + ``` * pad_shape[i] is sum of pads along axis i ``` - + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - kernel_spatial_shape[i] + 1) / strides_spatial_shape[i]) @@ -4378,9 +4378,9 @@ This version of the operator has been available since version 1 of the 'com.micr ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + kernel_spatial_shape[i] - input_spatial_shape[i] ``` - + The output of each pooling window is divided by the number of elements (exclude pad when attribute count_include_pad is zero). 
- + Input and output scales and zero points are used to convert the output to a new quantization range. Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output) @@ -4648,7 +4648,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QLinearMul** Performs element-wise binary multiplication on 8 bit data types (with Numpy-style broadcasting support). - + C = ((A - A_zero_point) * (B - B_zero_point)) * (A_scale * B_scale)/C_scale + C_zero_point #### Version @@ -4699,10 +4699,10 @@ This version of the operator has been available since version 1 of the 'com.micr with the exception that numpy default keepdims to False instead of True. Input and Output scales and zero points are used to requantize the output in a new range. This helps to improve accuracy as after ReduceMean operation the range of the output is expected to decrease. - + ``` "Output = Dequantize(Input) -> ReduceMean on fp32 data -> Quantize(output)", - + ``` #### Version @@ -4752,7 +4752,7 @@ This version of the operator has been available since version 1 of the 'com.micr QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data (Tensor) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise. - Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` + Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` #### Version @@ -4893,20 +4893,20 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.QMoE** Quantized mixture of experts (MoE). - + The quantized weights are stored in column major order per expert. The quantization block size can be specified. If not provided, column wise quantization is used. 
- + The formula of linear dequantization of the quantized weights using scale and (optionally) zero-point is: dequantized_weight = (quantized_weight - zero_point) * scale When zero_point is not provided, the default value is 2^(bits-1): 2 for 2 bits, 8 for 4 bits, 128 for 8 bits. - + If block_size is provided, both hidden_size and inter_size must be divisible by the block size, and the dequantization is performed per block of size block_size along the K (input feature) dimension. - + If block_size and zero_point are provided, both hidden_size and inter_size must be divisible by block_size * pack_size, where pack_size = 8 / expert_weight_bits. - + The SwiGLU (Swish-Gated Linear Unit) activation function is like: g = xW + b l = xV + c @@ -4917,7 +4917,7 @@ This version of the operator has been available since version 1 of the 'com.micr When swiglu_fusion=0, two GEMMs are not fused, and they are FC1 and FC3 in the inputs. When swiglu_fusion=1, two GEMMs are fused so that g and l are computed in a single GEMM (FC1), and g and l are interleaved on each row of size 2 * inter_size. When swiglu_fusion=2, two GEMMs are fused, and g and l are concatenated on each row. - + #### Version @@ -5668,10 +5668,10 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.RemovePadding** Compress transformer input by removing paddings. It assumes padding is on the right side of sequence. - + The input has padding with shape (batch_size, sequence_length, hidden_size). This will generate two outputs: output has shape (total_tokens, hidden_size); token_offset with shape (batch_size, sequence_length). - + token_offset has offsets of all non-padding tokens first, then offset of all padding tokens. It is a list of batch_size * sequence_length elements, which is reshaped to 2D for convenience of shape inference. 
@@ -5714,7 +5714,7 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.RestorePadding** Restore paddings and fill padding with zeros. - + The input has padding with shape (total_tokens, hidden_size) and token_offset with shape (batch_size, sequence_length). The output has shape (batch_size, sequence_length, hidden_size). @@ -5961,16 +5961,16 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.SkipGroupNorm** This operator element-wise adds x, skip and bias, then apply group normalization and optional activation. - + This operator transforms input according to s = x + skip + bias y = gamma * (s - mean) / sqrt(variance + epsilon) + beta - + The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. The num_channels must be divisible by num_groups. The mean and standard-deviation of s are calculated separately over the each group. The weight and bias are per-channel affine transform parameter vectors of size num_channels. - + The activation attribute can be used to enable activation after group normalization. #### Version @@ -6174,36 +6174,36 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.SparseAttention** Block Sparse Attention used in Phi-3-small (https://arxiv.org/pdf/2404.14219). - + It is inspired by Sparse Transformers (https://arxiv.org/pdf/1904.10509) and BigBird (https://arxiv.org/pdf/2007.14062). - + block_mask can be used to configure sparse layout for different head. When number of sparse layout is 1, all heads have same sparse layout. Otherwise, different layouts are used cyclically. For example, given 4 layouts (S0, S1, S2, S3), 8 heads will have layouts like (S0, S1, S2, S3, S0, S1, S2, S3). - + The block_row_indices and block_col_indices are the CSR representation of block mask. 
The block_col_indices might contain paddings at the right side when different layout has different number of non-zeros in block mask. - + An example of block mask with 2 layouts where each layout is 4 x 4 blocks: [[[1, 0, 0, 0], [1, 1, 0, 0], [0, 1, 1, 0], [0, 1, 1, 1]], - + [[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], [1, 0, 1, 1]]] - + The corresponding CSR format: block_col_indices = [[0, 0, 1, 1, 2, 1, 2, 3, -1], [0, 0, 1, 0, 1, 2, 0, 2, 3]] block_row_indices = [[0, 1, 3, 5, 8], [0, 1, 3, 6, 9]] - + When do_rotary is True, cos_cache and sin_cache are required. Note that the maximum sequence length supported by cos or sin cache can be different from the maximum sequence length used by kv cache. - + Only supports unidirectional attention with cache of past key and value in linear buffers. - + For performance, past_key and present_key share same memory buffer, and past_value and present_value too. #### Version @@ -6397,7 +6397,7 @@ This version of the operator has been available since version 1 of the 'com.micr Based on Torch operator Embedding, creates a lookup table of embedding vectors of fixed size, for a dictionary of fixed size. - + #### Version @@ -6487,7 +6487,7 @@ This version of the operator has been available since version 1 of the 'com.micr the main diagonal. A negative k value includes as many diagonals below the main diagonal. If upper is set to false, a positive k retains the lower triangular matrix including k diagonals above the main diagonal. A negative k value excludes as many diagonals below the main diagonal. - + #### Version @@ -6579,7 +6579,7 @@ This version of the operator has been available since version 1 of the 'com.micr output_uniques = [2, 1, 3, 4] output_idx = [0, 1, 1, 2, 3, 2] output_counts = [1, 2, 2, 1] - + #### Version @@ -6891,3 +6891,5 @@ No versioning maintained for experimental ops.
T : tensor(float)
Constrain input and output types to float32 tensors.
+ + From 7cf33d36f7f739bb3c8d0938b5fa91bdf83341bd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:46:42 +0000 Subject: [PATCH 6/6] Address QMoE review feedback on SM80 prepack docs and checks --- docs/contrib_ops/cuda/moe_qmoe.md | 6 +++--- onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/contrib_ops/cuda/moe_qmoe.md b/docs/contrib_ops/cuda/moe_qmoe.md index 6380fbe9a17f5..36b68889ae582 100644 --- a/docs/contrib_ops/cuda/moe_qmoe.md +++ b/docs/contrib_ops/cuda/moe_qmoe.md @@ -71,7 +71,7 @@ input tokens → router (top-k softmax) → permute by expert | `expert_weight_bits` (QMoE only) | int | 4 | 4 (INT4/MXFP4) or 8 (INT8/FP8). | | `block_size` (QMoE only) | int | -1 | Group size for INT4/INT8 group-wise quantization. -1 = per-output-channel. | | `quant_type` (QMoE only) | string | `"int"` | `"int"`, `"fp4"`, `"fp8"`, `"wfp4afp8"`. See [§3](#3-quantization-modes). | -| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. The prepacked layouts selected by `-1` and `1` are **EP-determined**. `-1` (default): the INT4/INT8 `fc1`/`fc2` initializers are already prepacked in the EP's default layout (e.g. from `pack_weights_for_cuda_mixed_gemm` for the CUDA EP). `1`: already prepacked in the EP's SM90 (Hopper) layout. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()` for the runtime arch. **Note:** the CUDA EP INT4/INT8 MoE GEMM always runs the Ampere (SM80) kernel — even on SM90 — so it consumes the SM80 `fpA_intB` layout on all architectures; `-1` and `1` are therefore equivalent for the CUDA EP today, and `1` is reserved for a possible future Hopper-specific layout. See [§5.1](#51-weights-input-2--5--8). 
| +| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. The prepacked layouts selected by `-1` and `1` are **EP-determined**. `-1` (default): the INT4/INT8 `fc1`/`fc2` initializers are already prepacked in the EP's default layout (e.g. from `pack_weights_for_cuda_mixed_gemm` for the CUDA EP). `1`: already prepacked in an alternate EP-selected layout. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()`. **Note:** the CUDA EP INT4/INT8 MoE GEMM always runs the Ampere (SM80) kernel — even on SM90 — so it consumes the SM80 `fpA_intB` layout on all architectures; `-1` and `1` are therefore equivalent for the CUDA EP today, and `1` is reserved for a possible future Hopper-specific layout. See [§5.1](#51-weights-input-2--5--8). | ### 2.2 Type Constraints @@ -1017,8 +1017,8 @@ over-aligned by-value parameters. - **In-`PrePack` INT weight layout transform** (`weights_prepacked=0`) is currently covered only by a smoke test (`TestQMoEIntPrePackSmoke`), not a bit-parity check: the existing offline pre-pack harness hardcodes - `force_arch=80` and produces incorrect output on SM≥90, so a parity - comparison against it is omitted until that harness honours the runtime SM. + `force_arch=80` (the same SM80 layout consumed by the CUDA EP on all GPUs), + so a separate parity harness for this path is still pending. - **Hopper W4A8** (INT4 weight + FP8 activation) is not supported — TRT-LLM gates its fast path to SM89 only. 
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc index 637ee8957e142..7d1291e004d78 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc @@ -67,7 +67,7 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE // concrete prepacked layouts selected by -1 and 1 are determined by the // execution provider. The CUDA EP maps the tri-state as: // -1 (default): already prepacked in the EP's default int weight layout. - // 1: already prepacked in the EP's SM90 (Hopper) int weight layout. + // 1: already prepacked in an alternate EP-selected int weight layout. // 0: raw [E, N, K/pack] initializers; the PrePack hook lays them out. // // Important: the CUDA QMoE int4/int8 MoE GEMM always dispatches to the @@ -77,6 +77,8 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE // consumes the SM80/Ampere CUTLASS fpA_intB layout on every GPU. As a result // the EP default (-1) is the SM80 layout regardless of the runtime device SM, // and SM80-format weights are valid on SM90 (they run via the SM80 kernel). + // For CUDA today, -1 and 1 are equivalent (both SM80 layout), and 1 is + // reserved for a possible future Hopper-specific layout. // PrePack (weights_prepacked=0) packs for the SM80 layout accordingly. 
const int64_t weights_prepacked_mode = op_kernel_info.GetAttrOrDefault("weights_prepacked", static_cast(-1)); @@ -1154,6 +1156,9 @@ void QMoE::PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, Al IAllocatorUniquePtr& packed_buf, bool& is_packed) { ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8, "PrePackIntExpertWeights: only 4 and 8 bits are supported, got ", expert_weight_bits_); + ORT_ENFORCE(sm_ >= 75, + "PrePackIntExpertWeights: quant_type='int' with weights_prepacked=0 requires SM75+ CUDA hardware, got SM", + sm_); const auto& shape = tensor.Shape(); ORT_ENFORCE(shape.NumDimensions() == 3, "PrePackIntExpertWeights: expected 3-D weight tensor [E, N, K/pack], got ndim=",