From 8e3f22eaedcbed7c46ff9ae33a61e9ad94b157d5 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Fri, 8 May 2026 16:53:51 -0500
Subject: [PATCH 01/32] Initial AI implementation from prototype

---
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 215 ++++++++++++
 .../migraphx/kernels/nonmaxsuppression.hpp    | 305 ++++++++++++++++++
 .../kernels/include/migraphx/kernels/sort.hpp |  28 ++
 src/targets/gpu/lowering.cpp                  |   8 +
 test/verify/test_nms.cpp                      | 113 ++++++-
 5 files changed, 668 insertions(+), 1 deletion(-)
 create mode 100644 src/targets/gpu/jit/nonmaxsuppression.cpp
 create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
new file mode 100644
index 00000000000..1b6ba4bb62c
--- /dev/null
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -0,0 +1,215 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/bit.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/module.hpp>
+
+#include <algorithm>
+#include <cstdint>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// nms_data is laid out as { float score; float box[4]; int box_index; } for a
+// total of 24 bytes per entry. The scratch workspace is allocated as raw int8
+// and reinterpreted in the kernel.
+static constexpr std::size_t nms_bytes_per_data = 24;
+
+// NOLINTNEXTLINE
+static const char* const nms_kernel_src = R"__migraphx__(
+#include <migraphx/kernels/nonmaxsuppression.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+
+MIGRAPHX_GLOBAL void nms_kernel(${params})
+{
+    make_tensors()(${args})([](auto boxes,
+                                auto scores,
+                                auto max_p,
+                                auto iou_p,
+                                auto thr_p,
+                                auto sorted,
+                                auto mask,
+                                auto count,
+                                auto out) {
+        nonmaxsuppression<${center_point_box},
+                          ${num_batches},
+                          ${num_classes},
+                          ${num_boxes},
+                          ${aligned_num_boxes}>(
+            boxes, scores, max_p, iou_p, thr_p, sorted, mask, count, out);
+    });
+}
+
+}
+
+} // namespace migraphx
+)__migraphx__";
+
+struct nms_compiler : compiler<nms_compiler>
+{
+    std::vector<std::string> names() const { return {"nonmaxsuppression"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        // inputs (in order): boxes, scores, max, iou, score_thr,
+        //                    sorted_data, iou_mask, global_count, output.
+        const auto& boxes_s   = inputs[0];
+        const auto& scores_s  = inputs[1];
+        const auto nb         = boxes_s.lens()[0];
+        const auto b          = boxes_s.lens()[1];
+        const auto nc         = scores_s.lens()[1];
+        const auto aligned_b  = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        const auto block_size = std::min<std::size_t>(aligned_b, std::size_t{1024});
+
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.kernel_name    = "nms_kernel";
+        options.virtual_inputs = inputs;
+        options.set_launch_params(v, block_size * nb * nc, block_size);
+
+        auto src = interpolate_string(
+            nms_kernel_src,
+            {{"params", enum_params(inputs.size(), "void * private_p")},
+             {"args", enum_params(inputs.size(), "private_p")},
+             {"num_batches", std::to_string(nb)},
+             {"num_classes", std::to_string(nc)},
+             {"num_boxes", std::to_string(b)},
+             {"aligned_num_boxes", std::to_string(aligned_b)},
+             {"center_point_box",
+              v.at("center_point_box").to<bool>() ? "true" : "false"}});
+        return compile_hip_code_object(ctx, src, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        // ins->inputs() is [user_inputs..., output_alloc] from
+        // insert_precompile_op. user_inputs has 2..5 entries per ONNX NMS.
+        auto raw = ins->inputs();
+        if(raw.size() < 3 or raw.size() > 6)
+            MIGRAPHX_THROW("nms_compiler: unexpected input count " + std::to_string(raw.size()));
+
+        std::vector<shape> raw_shapes;
+        raw_shapes.reserve(raw.size() - 1);
+        std::transform(raw.begin(),
+                       raw.end() - 1,
+                       std::back_inserter(raw_shapes),
+                       [](auto i) { return i->get_shape(); });
+
+        // Default shapes for missing optional scalar inputs. The literals
+        // inserted by the replace lambda use these same shapes so the
+        // compiled kernel's tensor_view types match the runtime arguments.
+        const shape default_max_s{shape::int64_type, {1}};
+        const shape default_iou_s{shape::float_type, {1}};
+        const shape default_thr_s{shape::float_type, {1}};
+        if(raw_shapes.size() < 3)
+            raw_shapes.push_back(default_max_s);
+        if(raw_shapes.size() < 4)
+            raw_shapes.push_back(default_iou_s);
+        if(raw_shapes.size() < 5)
+            raw_shapes.push_back(default_thr_s);
+
+        const auto& boxes_s  = raw_shapes[0];
+        const auto& scores_s = raw_shapes[1];
+        if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
+            MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D");
+
+        const auto nb         = boxes_s.lens()[0];
+        const auto b          = boxes_s.lens()[1];
+        const auto nc         = scores_s.lens()[1];
+        const auto aligned_b  = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1};
+
+        shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
+        shape mask_s{shape::uint8_type, {nb * nc * iou_packed}};
+        shape count_s{shape::int64_type, {1}};
+
+        std::vector<shape> kshapes = raw_shapes;
+        kshapes.push_back(sorted_s);
+        kshapes.push_back(mask_s);
+        kshapes.push_back(count_s);
+        kshapes.push_back(raw.back()->get_shape());
+
+        auto kop = compile_op(ctx, kshapes, op.to_value());
+
+        return {kop, [=](module& m, instruction_ref ins2, const operation& cop) {
+                    auto args = ins2->inputs();
+                    auto out  = args.back();
+                    args.pop_back();
+
+                    if(args.size() < 3)
+                    {
+                        args.push_back(m.insert_literal(
+                            ins2, literal{default_max_s, {std::int64_t{0}}}));
+                    }
+                    if(args.size() < 4)
+                    {
+                        args.push_back(
+                            m.insert_literal(ins2, literal{default_iou_s, {0.0f}}));
+                    }
+                    if(args.size() < 5)
+                    {
+                        args.push_back(
+                            m.insert_literal(ins2, literal{default_thr_s, {0.0f}}));
+                    }
+
+                    auto sorted = m.insert_instruction(
+                        ins2, make_op("hip::allocate", {{"shape", to_value(sorted_s)}}));
+                    auto mask = m.insert_instruction(
+                        ins2, make_op("hip::allocate", {{"shape", to_value(mask_s)}}));
+                    auto count = m.insert_instruction(
+                        ins2, make_op("hip::allocate", {{"shape", to_value(count_s)}}));
+
+                    // Reset the global atomic counter to zero each launch and
+                    // pre-zero the output buffer so unwritten rows match the
+                    // CPU implementation's behavior.
+                    count = m.insert_instruction(
+                        ins2, make_op("hip::fill", {{"value", 0}}), count);
+                    out = m.insert_instruction(
+                        ins2, make_op("hip::fill", {{"value", 0}}), out);
+
+                    args.push_back(sorted);
+                    args.push_back(mask);
+                    args.push_back(count);
+                    args.push_back(out);
+
+                    m.replace_instruction(ins2, cop, args);
+                }};
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
new file mode 100644
index 00000000000..f9f4a88a69a
--- /dev/null
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -0,0 +1,305 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP
+#define MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP
+
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/array.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/math.hpp>
+#include <migraphx/kernels/sort.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/types.hpp>
+
+namespace migraphx {
+
+// Per-box record carried through the sort; box holds normalized (xmin, ymin,
+// xmax, ymax) corners so the IoU math is independent of center_point_box.
+struct nms_data
+{
+    float score;
+    array<float, 4> box;
+    int box_index;
+};
+static_assert(sizeof(nms_data) == 24, "keep in sync with host-side nms_bytes_per_data");
+
+// Decode a single box into (xmin, ymin, xmax, ymax) corners.
+template <bool CenterPointBox>
+__device__ inline array<float, 4> nms_normalize_box(const float* b)
+{
+    if constexpr(CenterPointBox)
+    {
+        const float xc = b[0];
+        const float yc = b[1];
+        const float hw = b[2] * 0.5f;
+        const float hh = b[3] * 0.5f;
+        return {xc - hw, yc - hh, xc + hw, yc + hh};
+    }
+    else
+    {
+        // ONNX layout: [y1, x1, y2, x2]; corners may be in either order.
+        const float y1   = b[0];
+        const float x1   = b[1];
+        const float y2   = b[2];
+        const float x2   = b[3];
+        const float xmin = min(x1, x2);
+        const float xmax = max(x1, x2);
+        const float ymin = min(y1, y2);
+        const float ymax = max(y1, y2);
+        return {xmin, ymin, xmax, ymax};
+    }
+}
+
+__device__ inline bool
+nms_iou_over_threshold(const array<float, 4>& a, const array<float, 4>& b, float threshold)
+{
+    const float left   = max(a[0], b[0]);
+    const float right  = min(a[2], b[2]);
+    const float top    = max(a[1], b[1]);
+    const float bottom = min(a[3], b[3]);
+    const float w      = max(right - left, 0.f);
+    const float h      = max(bottom - top, 0.f);
+    const float inter  = w * h;
+    const float area_a = max(a[2] - a[0], 0.f) * max(a[3] - a[1], 0.f);
+    const float area_b = max(b[2] - b[0], 0.f) * max(b[3] - b[1], 0.f);
+    const float un     = area_a + area_b - inter;
+    if(area_a <= 0.f or area_b <= 0.f or un <= 0.f)
+        return false;
+    return (inter / un) > threshold;
+}
+
+// Packed upper-triangular index for j > i within an N x N matrix.
+__device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N)
+{
+    return (i * N - (i * (i + 1)) / 2) + j - (i + 1);
+}
+
+struct nms_score_greater // descending score; equal scores: larger box_index first
+{
+    constexpr bool operator()(const nms_data& a, const nms_data& b) const
+    {
+        return a.score > b.score or (a.score == b.score and a.box_index > b.box_index);
+    }
+};
+
+// Phase 1: load (score, box, box_index) tuples into a per-block buffer of
+// AlignedN entries (power of two), padding the [N, AlignedN) tail with sentinel
+// records, then sort the buffer with nms_score_greater (descending by score).
+template <bool CenterPointBox, index_int N, index_int AlignedN>
+__device__ void nms_load_and_sort(index idx,
+                                  const float* boxes_b,   // [N, 4]
+                                  const float* scores_bc, // [N]
+                                  nms_data* sorted)
+{
+    idx.local_stride(AlignedN, [&](auto i) {
+        nms_data d;
+        if(i < N)
+        {
+            d.score     = scores_bc[i];
+            d.box       = nms_normalize_box<CenterPointBox>(boxes_b + i * 4);
+            d.box_index = static_cast<int>(i);
+        }
+        else
+        {
+            // Sentinel: -inf score plus box_index -1, so nms_score_greater puts
+            // it after every real entry, even a real entry scored -inf.
+            d.score     = -__builtin_huge_valf();
+            d.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
+            d.box_index = -1;
+        }
+        sorted[i] = d;
+    });
+    __syncthreads();
+    bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedN>(idx, sorted);
+}
+
+// Phase 2: build the packed upper-triangular IoU mask for the N sorted boxes.
+// Work is striped (i, N-1-i) per thread so each thread does roughly the same
+// amount of work regardless of where it falls in the triangle.
+template <index_int N>
+__device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* mask, float iou_thr)
+{
+    constexpr index_int half = N / 2;
+
+    auto fill_row = [&](index_int i) {
+        for(index_int j = i + 1; j < N; ++j)
+        {
+            mask[nms_packed_idx(i, j, N)] =
+                nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_thr) ? 1 : 0;
+        }
+    };
+
+    idx.local_stride(half, [&](auto i) {
+        fill_row(i);
+        fill_row(N - 1 - i);
+    });
+
+    if constexpr((N & 1) != 0 and N > 1)
+    {
+        if(idx.local == 0)
+            fill_row(half);
+    }
+}
+
+// Phase 3: greedy filter, mirroring the prototype but using a global atomic
+// counter to compact outputs from all (batch, class) blocks into a single
+// dense output buffer.
+template <index_int N>
+__device__ void nms_filter_atomic(index idx,
+                                  const nms_data* sorted,
+                                  const uint8_t* mask,
+                                  int batch_idx,
+                                  int class_idx,
+                                  index_int max_output,
+                                  float score_thr,
+                                  unsigned long long* global_count, // NOLINT
+                                  int64_t* output,
+                                  index_int output_capacity)
+{
+    __shared__ uint8_t removed[N > 0 ? N : 1];
+    // Match the CPU op: only filter by score when score_threshold > 0 (the CPU
+    // takes the same branch). With a non-positive (or sentinel) threshold, all
+    // boxes are kept regardless of sign.
+    const bool do_filter = score_thr > 0.f;
+    idx.local_stride(N, [&](auto i) {
+        removed[i] = (do_filter and sorted[i].score < score_thr) ? 1 : 0;
+    });
+    __syncthreads();
+
+    index_int output_idx = 0;
+    for(index_int i = 0; i < N; ++i)
+    {
+        if(output_idx >= max_output)
+        {
+            __syncthreads();
+            break;
+        }
+        if(removed[i] == 0)
+        {
+            if(idx.local == 0)
+            {
+                const unsigned long long slot = atomicAdd(global_count, 1ull); // NOLINT
+                if(slot < static_cast<unsigned long long>(output_capacity))
+                {
+                    output[slot * 3 + 0] = batch_idx;
+                    output[slot * 3 + 1] = class_idx;
+                    output[slot * 3 + 2] = sorted[i].box_index;
+                }
+            }
+            ++output_idx;
+            // Update removed[] using row i of the IoU mask. Each thread handles
+            // a stride of the row to balance work.
+            for(index_int j = i + 1 + idx.local; j < N; j += idx.nlocal())
+            {
+                removed[j] |= mask[nms_packed_idx(i, j, N)];
+            }
+        }
+        __syncthreads();
+    }
+}
+
+// Per-block driver: one block per (batch_idx, class_idx). Workspace pointers
+// are sliced into per-block segments using idx.group.
+template <bool CenterPointBox,
+          index_int NumBatches,
+          index_int NumClasses,
+          index_int NumBoxes,
+          index_int AlignedNumBoxes,
+          class Boxes,
+          class Scores,
+          class MaxOut,
+          class IouThr,
+          class ScoreThr,
+          class Sorted,
+          class Mask,
+          class Count,
+          class Output>
+__device__ void nonmaxsuppression(Boxes boxes,
+                                  Scores scores,
+                                  MaxOut max_out_p,
+                                  IouThr iou_thr_p,
+                                  ScoreThr score_thr_p,
+                                  Sorted sorted_buf,
+                                  Mask mask_buf,
+                                  Count count_buf,
+                                  Output output)
+{
+    static_assert(NumBatches > 0, "num_batches must be > 0");
+    static_assert(NumClasses > 0, "num_classes must be > 0");
+
+    auto idx                            = make_index();
+    const index_int block_id            = idx.group;
+    const int batch_idx                 = static_cast<int>(block_id / NumClasses);
+    const int class_idx                 = static_cast<int>(block_id % NumClasses);
+    constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1;
+
+    nms_data* my_sorted =
+        reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
+    uint8_t* my_mask = reinterpret_cast<uint8_t*>(mask_buf.data()) + block_id * iou_packed_size;
+
+    const float* boxes_b   = boxes.data() + batch_idx * NumBoxes * 4;
+    const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
+
+    // Pull scalar tensor inputs once. They're broadcast to all threads via the
+    // common load (each thread reads the same single element).
+    const int64_t max_out_val = max_out_p[0];
+    const float iou_thr_val   = iou_thr_p[0];
+    const float score_thr_val = score_thr_p[0];
+
+    nms_load_and_sort<CenterPointBox, NumBoxes, AlignedNumBoxes>(
+        idx, boxes_b, scores_bc, my_sorted);
+    __syncthreads();
+
+    if constexpr(NumBoxes > 1)
+    {
+        nms_make_iou_mask<NumBoxes>(idx, my_sorted, my_mask, iou_thr_val);
+        __syncthreads();
+    }
+
+    // The CPU op reads max_output_boxes_per_class as std::size_t, so a negative
+    // signed value is treated as a very large unsigned (effectively unlimited).
+    // Mirror that here by reinterpreting as unsigned and then capping at
+    // NumBoxes, which is the most we could ever emit per (batch, class) block.
+    const auto max_unsigned         = static_cast<uint64_t>(max_out_val);
+    const index_int max_output      = (max_unsigned > static_cast<uint64_t>(NumBoxes))
+                                          ? static_cast<index_int>(NumBoxes)
+                                          : static_cast<index_int>(max_unsigned);
+    const index_int output_capacity = output.get_shape().lens[0];
+    auto* count_addr =
+        reinterpret_cast<unsigned long long*>(count_buf.data()); // NOLINT
+    nms_filter_atomic<NumBoxes>(idx,
+                                my_sorted,
+                                my_mask,
+                                batch_idx,
+                                class_idx,
+                                max_output,
+                                score_thr_val,
+                                count_addr,
+                                output.data(),
+                                output_capacity);
+}
+
+} // namespace migraphx
+
+#endif // MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index 1e3cc019558..fa4d1c981e2 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -139,6 +139,34 @@ struct bitonic_sort
                 lane_merge(get_bit(id, w), x);
         });
     }
+
+    // Block-level bitonic sort of buf[0, N) in shared or global memory, where
+    // N is a compile-time power of 2 (callers pad shorter inputs with sentinel
+    // values). Every thread of the block must call this with the same buf:
+    // each compare-exchange stage ends in __syncthreads(), including the last
+    // one before returning. The comparator determines the final order (e.g. a
+    // greater-than comparator yields descending order).
+    template <index_int N, class T>
+    __device__ void block_sort(index idx, T* buf) const
+    {
+        static_assert(is_power_of_2(N), "N must be a power of 2");
+        for(index_int k = 2; k <= N; k <<= 1)
+        {
+            for(index_int j = k >> 1; j > 0; j >>= 1)
+            {
+                idx.local_stride(N, [&](auto tid) {
+                    index_int partner = tid ^ j;
+                    if(partner > tid)
+                    {
+                        const bool reverse = (tid & k) != 0;
+                        if(this->compare(buf[tid], buf[partner], reverse))
+                            swap(buf[tid], buf[partner]);
+                    }
+                });
+                __syncthreads();
+            }
+        }
+    }
 };
 
 MIGRAPHX_AUTO_DEDUCE(bitonic_sort);
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 6b6def4721d..108d994057a 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -449,6 +449,14 @@ struct miopen_apply
     void add_nms_op()
     {
         apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
+            // Fixed-output NMS is handled by the JIT kernel registered via
+            // jit/nonmaxsuppression.cpp; route it through insert_precompile_op
+            // so compile_ops picks it up later. The dynamic-output mode still
+            // falls back to the CPU implementation.
+            auto op_val = ins->get_operator().to_value();
+            if(not op_val.at("use_dyn_output").to<bool>())
+                return insert_precompile_op(ins);
+
             auto s      = ins->get_shape();
             auto output = insert_allocation(ins, s);
             std::vector<instruction_ref> cpu_inputs;
diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp
index 6b3e56bafd6..98702a12b5e 100644
--- a/test/verify/test_nms.cpp
+++ b/test/verify/test_nms.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -57,3 +57,114 @@ struct test_nms : verify_program<test_nms>
         return p;
     }
 };
+
+// Multi-batch fixed-output NMS exercises the (batch_idx, class_idx) -> block_id
+// dispatch in the GPU kernel.
+struct test_nms_multi_batch : verify_program<test_nms_multi_batch>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+
+        migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}};
+        std::vector<float> boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6,   1.0, 1.0, 0.5, 0.4,  1.0,
+                                        1.0, 0.5, 10.5, 1.0, 1.0, 0.5,  10.6, 1.0, 1.0, 0.5, 100.5,
+                                        1.0, 1.0, 0.5, 0.5, 1.0, 1.0,   0.5, 0.6, 1.0, 1.0, 0.5,
+                                        0.4, 1.0, 1.0, 0.5, 10.5, 1.0,  1.0, 0.5, 10.6, 1.0, 1.0,
+                                        0.5, 100.5, 1.0, 1.0};
+
+        migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}};
+        std::vector<float> scores_vec = {
+            0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+
+        auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto max_out_l       = mm->add_literal(int64_t{4});
+        auto iou_threshold   = mm->add_literal(0.5f);
+        auto score_threshold = mm->add_literal(0.0f);
+
+        auto r =
+            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
+                                boxes_l,
+                                scores_l,
+                                max_out_l,
+                                iou_threshold,
+                                score_threshold);
+        mm->add_return({r});
+
+        return p;
+    }
+};
+
+// Multi-class fixed-output NMS exercises per-class greedy filtering with
+// outputs interleaved by the global atomic counter.
+struct test_nms_multi_class : verify_program<test_nms_multi_class>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+
+        migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+        std::vector<float> boxes_vec = {0.0, 0.0, 1.0,   1.0,   0.0, 0.1,   1.0, 1.1, 0.0,
+                                        -0.1, 1.0, 0.9,  0.0,   10.0, 1.0,  11.0, 0.0, 10.1,
+                                        1.0, 11.1, 0.0,  100.0, 1.0,  101.0};
+
+        migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}};
+        std::vector<float> scores_vec = {
+            0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+
+        auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto max_out_l       = mm->add_literal(int64_t{2});
+        auto iou_threshold   = mm->add_literal(0.5f);
+        auto score_threshold = mm->add_literal(0.0f);
+
+        auto r =
+            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
+                                boxes_l,
+                                scores_l,
+                                max_out_l,
+                                iou_threshold,
+                                score_threshold);
+        mm->add_return({r});
+
+        return p;
+    }
+};
+
+// center_point_box=0 path with potentially flipped corner coordinates.
+struct test_nms_not_center : verify_program<test_nms_not_center>
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+
+        migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+        std::vector<float> boxes_vec = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
+                                        0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
+                                        1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
+
+        migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+        std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+
+        auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto max_out_l       = mm->add_literal(int64_t{4});
+        auto iou_threshold   = mm->add_literal(0.5f);
+        auto score_threshold = mm->add_literal(0.0f);
+
+        auto r =
+            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 0}}),
+                                boxes_l,
+                                scores_l,
+                                max_out_l,
+                                iou_threshold,
+                                score_threshold);
+        mm->add_return({r});
+
+        return p;
+    }
+};

From 4ec2fe13103402fbca7f13b687eab13b809821b2 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Tue, 12 May 2026 16:00:09 -0500
Subject: [PATCH 02/32] AI edit with "compact" kernel

---
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 160 ++++++++++++++----
 .../migraphx/kernels/nonmaxsuppression.hpp    | 135 ++++++++++-----
 test/verify/test_nms.cpp                      |  35 +---
 3 files changed, 230 insertions(+), 100 deletions(-)

diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index 1b6ba4bb62c..ff7dd14e289 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -43,8 +43,12 @@ namespace gpu {
 // and reinterpreted in the kernel.
 static constexpr std::size_t nms_bytes_per_data = 24;
 
+// Phase-1 ("compute") kernel: each block runs NMS for its (batch, class) and
+// writes selections into a per-block region of the raw_output scratch plus a
+// per-block count. No global atomic counter is used, so per-block contents
+// are deterministic.
 // NOLINTNEXTLINE
-static const char* const nms_kernel_src = R"__migraphx__(
+static const char* const nms_compute_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
 #include <args.hpp>
 
@@ -61,14 +65,40 @@ MIGRAPHX_GLOBAL void nms_kernel(${params})
                                 auto thr_p,
                                 auto sorted,
                                 auto mask,
-                                auto count,
-                                auto out) {
+                                auto counts,
+                                auto raw_out) {
         nonmaxsuppression<${center_point_box},
                           ${num_batches},
                           ${num_classes},
                           ${num_boxes},
                           ${aligned_num_boxes}>(
-            boxes, scores, max_p, iou_p, thr_p, sorted, mask, count, out);
+            boxes, scores, max_p, iou_p, thr_p, sorted, mask, counts, raw_out);
+    });
+}
+
+}
+
+} // namespace migraphx
+)__migraphx__";
+
+// Phase-2 ("compact") kernel: a single thread walks the per-block raw_output
+// regions in block_id order and copies the first counts[b] selections from
+// each region into a contiguous prefix of the final output. The order of
+// (block_id 0, 1, ...) is the same as the CPU op's (batch, class) iteration
+// order, so the resulting output matches the CPU op exactly.
+// NOLINTNEXTLINE
+static const char* const nms_compact_kernel_src = R"__migraphx__(
+#include <migraphx/kernels/nonmaxsuppression.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+
+MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
+{
+    make_tensors()(${args})([](auto counts, auto raw_out, auto out) {
+        nonmaxsuppression_compact<${num_blocks}, ${num_boxes}>(counts, raw_out, out);
     });
 }
 
@@ -81,10 +111,13 @@ struct nms_compiler : compiler<nms_compiler>
 {
     std::vector<std::string> names() const { return {"nonmaxsuppression"}; }
 
-    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    // Compile the per-block compute kernel. `inputs` is:
+    //   [boxes, scores, max, iou, score_thr, sorted, mask, counts, raw_output]
+    // `raw_output` is the last input so the framework treats it as the
+    // kernel's output buffer; the per-block counts is an in/out scratch.
+    operation
+    compile_compute(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        // inputs (in order): boxes, scores, max, iou, score_thr,
-        //                    sorted_data, iou_mask, global_count, output.
         const auto& boxes_s   = inputs[0];
         const auto& scores_s  = inputs[1];
         const auto nb         = boxes_s.lens()[0];
@@ -95,13 +128,13 @@ struct nms_compiler : compiler<nms_compiler>
 
         hip_compile_options options;
         options.inputs         = inputs;
-        options.output         = inputs.back();
+        options.output         = inputs.back(); // raw_output buffer
         options.kernel_name    = "nms_kernel";
         options.virtual_inputs = inputs;
         options.set_launch_params(v, block_size * nb * nc, block_size);
 
         auto src = interpolate_string(
-            nms_kernel_src,
+            nms_compute_kernel_src,
             {{"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
              {"num_batches", std::to_string(nb)},
@@ -113,6 +146,47 @@ struct nms_compiler : compiler<nms_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
+    // Compile the serial compaction kernel. `inputs` is:
+    //   [counts, raw_output, output]
+    // Launched with one thread (single block, single thread) since the work
+    // is intentionally serial: it walks per-block regions in fixed order to
+    // produce the exact byte-for-byte output the CPU op produces.
+    operation
+    compile_compact(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        // Derive num_blocks (length of counts) and per-block stride NumBoxes
+        // (raw_output is sized nb*nc*NumBoxes*3 int64 entries).
+        const auto& cnt_s     = inputs[0];
+        const auto& raw_s     = inputs[1];
+        const auto num_blocks = cnt_s.elements();
+        const auto num_boxes  = (num_blocks > 0)
+                                    ? raw_s.elements() / (num_blocks * std::size_t{3})
+                                    : std::size_t{0};
+
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.kernel_name    = "nms_compact_kernel";
+        options.virtual_inputs = inputs;
+        options.set_launch_params(v, std::size_t{1}, std::size_t{1});
+
+        auto src = interpolate_string(
+            nms_compact_kernel_src,
+            {{"params", enum_params(inputs.size(), "void * private_p")},
+             {"args", enum_params(inputs.size(), "private_p")},
+             {"num_blocks", std::to_string(num_blocks)},
+             {"num_boxes", std::to_string(num_boxes)}});
+        return compile_hip_code_object(ctx, src, options);
+    }
+
+    // Required compiler<> hook: return the compute kernel based on the raw
+    // input shapes. The full two-kernel chain is handled in `compile()`; this
+    // entry point is only used by callers that ask for a single op view.
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        return compile_compute(ctx, inputs, v);
+    }
+
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
         // ins->inputs() is [user_inputs..., output_alloc] from
@@ -154,17 +228,32 @@ struct nms_compiler : compiler<nms_compiler>
 
         shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
         shape mask_s{shape::uint8_type, {nb * nc * iou_packed}};
-        shape count_s{shape::int64_type, {1}};
+        // Per-block raw output: nb*nc blocks, each can write up to b
+        // selections of (batch, class, box_idx) int64 triples.
+        shape raw_output_s{shape::int64_type, {nb * nc * b * 3}};
+        // Per-block selection counts (one int32 per (batch, class) block).
+        shape counts_s{shape::int32_type, {nb * nc}};
+
+        // Compute kernel input shapes: [user inputs..., sorted, mask, counts, raw_out]
+        std::vector<shape> compute_shapes = raw_shapes;
+        compute_shapes.push_back(sorted_s);
+        compute_shapes.push_back(mask_s);
+        compute_shapes.push_back(counts_s);
+        compute_shapes.push_back(raw_output_s);
+
+        // Compact kernel input shapes: [counts, raw_out, output]
+        std::vector<shape> compact_shapes;
+        compact_shapes.push_back(counts_s);
+        compact_shapes.push_back(raw_output_s);
+        compact_shapes.push_back(raw.back()->get_shape());
 
-        std::vector<shape> kshapes = raw_shapes;
-        kshapes.push_back(sorted_s);
-        kshapes.push_back(mask_s);
-        kshapes.push_back(count_s);
-        kshapes.push_back(raw.back()->get_shape());
+        auto compute_kop = compile_compute(ctx, compute_shapes, op.to_value());
+        auto compact_kop = compile_compact(ctx, compact_shapes, op.to_value());
 
-        auto kop = compile_op(ctx, kshapes, op.to_value());
+        std::vector<operation> kops = {compute_kop, compact_kop};
 
-        return {kop, [=](module& m, instruction_ref ins2, const operation& cop) {
+        return {kops,
+                [=](module& m, instruction_ref ins2, const std::vector<operation>& cops) {
                     auto args = ins2->inputs();
                     auto out  = args.back();
                     args.pop_back();
@@ -189,23 +278,34 @@ struct nms_compiler : compiler<nms_compiler>
                         ins2, make_op("hip::allocate", {{"shape", to_value(sorted_s)}}));
                     auto mask = m.insert_instruction(
                         ins2, make_op("hip::allocate", {{"shape", to_value(mask_s)}}));
-                    auto count = m.insert_instruction(
-                        ins2, make_op("hip::allocate", {{"shape", to_value(count_s)}}));
-
-                    // Reset the global atomic counter to zero each launch and
-                    // pre-zero the output buffer so unwritten rows match the
-                    // CPU implementation's behavior.
-                    count = m.insert_instruction(
-                        ins2, make_op("hip::fill", {{"value", 0}}), count);
+                    auto raw_out = m.insert_instruction(
+                        ins2, make_op("hip::allocate", {{"shape", to_value(raw_output_s)}}));
+                    auto counts = m.insert_instruction(
+                        ins2, make_op("hip::allocate", {{"shape", to_value(counts_s)}}));
+
+                    // Pre-zero the final output buffer so unwritten rows match
+                    // the CPU implementation's behavior (trailing zeros). The
+                    // counts and raw_out scratch don't need zeroing: each
+                    // block writes its count exactly once and the compact
+                    // kernel only reads counts[b] entries from each block.
                     out = m.insert_instruction(
                         ins2, make_op("hip::fill", {{"value", 0}}), out);
 
-                    args.push_back(sorted);
-                    args.push_back(mask);
-                    args.push_back(count);
-                    args.push_back(out);
+                    auto compute_args = args;
+                    compute_args.push_back(sorted);
+                    compute_args.push_back(mask);
+                    compute_args.push_back(counts);
+                    compute_args.push_back(raw_out);
+
+                    auto compute_ins =
+                        m.insert_instruction(ins2, cops[0], compute_args);
 
-                    m.replace_instruction(ins2, cop, args);
+                    // Use compute_ins (returned raw_out) as the dataflow edge
+                    // so the compact kernel is ordered after the compute
+                    // kernel and the raw_out buffer remains live.
+                    std::vector<instruction_ref> compact_args = {
+                        counts, compute_ins, out};
+                    m.replace_instruction(ins2, cops[1], compact_args);
                 }};
     }
 };
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index f9f4a88a69a..ead79aed578 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -162,20 +162,20 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma
     }
 }
 
-// Phase 3: greedy filter, mirroring the prototype but using a global atomic
-// counter to compact outputs from all (batch, class) blocks into a single
-// dense output buffer.
+// Phase 3: greedy filter that writes selections into a per-block region of a
+// scratch buffer (block_id * N entries) and stores the per-block count. A
+// follow-up serial compaction kernel gathers per-block regions in block_id
+// order to produce a deterministic compacted output that matches the CPU op.
 template <index_int N>
-__device__ void nms_filter_atomic(index idx,
-                                  const nms_data* sorted,
-                                  const uint8_t* mask,
-                                  int batch_idx,
-                                  int class_idx,
-                                  index_int max_output,
-                                  float score_thr,
-                                  unsigned long long* global_count, // NOLINT
-                                  int64_t* output,
-                                  index_int output_capacity)
+__device__ void nms_filter_per_block(index idx,
+                                     const nms_data* sorted,
+                                     const uint8_t* mask,
+                                     int batch_idx,
+                                     int class_idx,
+                                     index_int max_output,
+                                     float score_thr,
+                                     int64_t* raw_output,    // [num_blocks * N * 3]
+                                     int32_t* block_counts)  // [num_blocks]
 {
     __shared__ uint8_t removed[N > 0 ? N : 1];
     // Match the CPU op: only filter by score when score_threshold > 0 (the CPU
@@ -187,6 +187,9 @@ __device__ void nms_filter_atomic(index idx,
     });
     __syncthreads();
 
+    const index_int block_id = idx.group;
+    int64_t* my_output       = raw_output + block_id * N * 3;
+
     index_int output_idx = 0;
     for(index_int i = 0; i < N; ++i)
     {
@@ -199,13 +202,9 @@ __device__ void nms_filter_atomic(index idx,
         {
             if(idx.local == 0)
             {
-                const unsigned long long slot = atomicAdd(global_count, 1ull); // NOLINT
-                if(slot < static_cast<unsigned long long>(output_capacity))
-                {
-                    output[slot * 3 + 0] = batch_idx;
-                    output[slot * 3 + 1] = class_idx;
-                    output[slot * 3 + 2] = sorted[i].box_index;
-                }
+                my_output[output_idx * 3 + 0] = batch_idx;
+                my_output[output_idx * 3 + 1] = class_idx;
+                my_output[output_idx * 3 + 2] = sorted[i].box_index;
             }
             ++output_idx;
             // Update removed[] using row i of the IoU mask. Each thread handles
@@ -217,10 +216,47 @@ __device__ void nms_filter_atomic(index idx,
         }
         __syncthreads();
     }
+
+    if(idx.local == 0)
+        block_counts[block_id] = static_cast<int32_t>(output_idx);
+}
+
+// Serial compaction: a single thread walks per-block regions in block_id order
+// (which equals the CPU op's (batch, class) iteration order) and copies the
+// first block_counts[b] entries of each region into a contiguous prefix of the
+// final output buffer. Trailing slots are left as the zero fill applied before
+// this kernel runs.
+template <index_int NumBlocks, index_int NumBoxes>
+__device__ void
+nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output)
+{
+    if(idx.global == 0)
+    {
+        index_int dst = 0;
+        for(index_int b = 0; b < NumBlocks; ++b)
+        {
+            const int32_t cnt  = block_counts[b];
+            const int64_t* src = raw_output + b * NumBoxes * 3;
+            for(int32_t i = 0; i < cnt; ++i)
+            {
+                output[dst * 3 + 0] = src[i * 3 + 0];
+                output[dst * 3 + 1] = src[i * 3 + 1];
+                output[dst * 3 + 2] = src[i * 3 + 2];
+                ++dst;
+            }
+        }
+    }
 }
 
 // Per-block driver: one block per (batch_idx, class_idx). Workspace pointers
-// are sliced into per-block segments using idx.group.
+// are sliced into per-block segments using idx.group. Selections are written
+// to a per-block region of `raw_output` and the per-block count is written to
+// `block_counts`; a follow-up compact kernel produces the final compacted
+// output that matches the CPU op's ordering.
+//
+// `raw_output_buf` is intentionally the last parameter so that JIT-compiled
+// callers (which use `inputs.back()` as the kernel's output buffer) treat it
+// as the chained output flowing into the compact kernel.
 template <bool CenterPointBox,
           index_int NumBatches,
           index_int NumClasses,
@@ -233,8 +269,8 @@ template <bool CenterPointBox,
           class ScoreThr,
           class Sorted,
           class Mask,
-          class Count,
-          class Output>
+          class Counts,
+          class RawOutput>
 __device__ void nonmaxsuppression(Boxes boxes,
                                   Scores scores,
                                   MaxOut max_out_p,
@@ -242,8 +278,8 @@ __device__ void nonmaxsuppression(Boxes boxes,
                                   ScoreThr score_thr_p,
                                   Sorted sorted_buf,
                                   Mask mask_buf,
-                                  Count count_buf,
-                                  Output output)
+                                  Counts counts_buf,
+                                  RawOutput raw_output_buf)
 {
     static_assert(NumBatches > 0, "num_batches must be > 0");
     static_assert(NumClasses > 0, "num_classes must be > 0");
@@ -281,23 +317,38 @@ __device__ void nonmaxsuppression(Boxes boxes,
     // signed value is treated as a very large unsigned (effectively unlimited).
     // Mirror that here by reinterpreting as unsigned and then capping at
     // NumBoxes, which is the most we could ever emit per (batch, class) block.
-    const auto max_unsigned         = static_cast<uint64_t>(max_out_val);
-    const index_int max_output      = (max_unsigned > static_cast<uint64_t>(NumBoxes))
-                                          ? static_cast<index_int>(NumBoxes)
-                                          : static_cast<index_int>(max_unsigned);
-    const index_int output_capacity = output.get_shape().lens[0];
-    auto* count_addr =
-        reinterpret_cast<unsigned long long*>(count_buf.data()); // NOLINT
-    nms_filter_atomic<NumBoxes>(idx,
-                                my_sorted,
-                                my_mask,
-                                batch_idx,
-                                class_idx,
-                                max_output,
-                                score_thr_val,
-                                count_addr,
-                                output.data(),
-                                output_capacity);
+    const auto max_unsigned    = static_cast<uint64_t>(max_out_val);
+    const index_int max_output = (max_unsigned > static_cast<uint64_t>(NumBoxes))
+                                     ? static_cast<index_int>(NumBoxes)
+                                     : static_cast<index_int>(max_unsigned);
+    nms_filter_per_block<NumBoxes>(idx,
+                                   my_sorted,
+                                   my_mask,
+                                   batch_idx,
+                                   class_idx,
+                                   max_output,
+                                   score_thr_val,
+                                   reinterpret_cast<int64_t*>(raw_output_buf.data()),
+                                   reinterpret_cast<int32_t*>(counts_buf.data()));
+}
+
+// Serial compact wrapper invoked from the second JIT kernel. Reads the
+// per-block counts and raw_output produced by `nonmaxsuppression` and copies
+// selections into the final output in block_id (i.e. (batch, class)) order.
+// `output` is last to match the JIT convention of using `inputs.back()` as
+// the kernel's logical output buffer.
+template <index_int NumBlocks, index_int NumBoxes, class Counts, class RawOutput, class Output>
+__device__ void nonmaxsuppression_compact(Counts counts_buf,
+                                          RawOutput raw_output_buf,
+                                          Output output)
+{
+    static_assert(NumBlocks > 0, "num_blocks must be > 0");
+
+    auto idx = make_index();
+    nms_compact<NumBlocks, NumBoxes>(idx,
+                                     reinterpret_cast<const int64_t*>(raw_output_buf.data()),
+                                     reinterpret_cast<const int32_t*>(counts_buf.data()),
+                                     output.data());
 }
 
 } // namespace migraphx
diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp
index c99364396f1..94828ae35a2 100644
--- a/test/verify/test_nms.cpp
+++ b/test/verify/test_nms.cpp
@@ -35,12 +35,10 @@ struct test_nms : verify_program<test_nms>
         auto* mm = p.get_main_module();
 
         migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-
         migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-        std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
 
         auto boxes_l         = mm->add_parameter("boxes", boxes_s);
-        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto scores_l        = mm->add_parameter("scores", scores_s);
         auto max_out_l       = mm->add_literal(int64_t{4});
         auto iou_threshold   = mm->add_literal(0.5f);
         auto score_threshold = mm->add_literal(0.0f);
@@ -68,18 +66,10 @@ struct test_nms_multi_batch : verify_program<test_nms_multi_batch>
         auto* mm = p.get_main_module();
 
         migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}};
-        std::vector<float> boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6,   1.0, 1.0, 0.5, 0.4,  1.0,
-                                        1.0, 0.5, 10.5, 1.0, 1.0, 0.5,  10.6, 1.0, 1.0, 0.5, 100.5,
-                                        1.0, 1.0, 0.5, 0.5, 1.0, 1.0,   0.5, 0.6, 1.0, 1.0, 0.5,
-                                        0.4, 1.0, 1.0, 0.5, 10.5, 1.0,  1.0, 0.5, 10.6, 1.0, 1.0,
-                                        0.5, 100.5, 1.0, 1.0};
-
         migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}};
-        std::vector<float> scores_vec = {
-            0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
 
-        auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
+        auto scores_l        = mm->add_parameter("scores", scores_s);
         auto max_out_l       = mm->add_literal(int64_t{4});
         auto iou_threshold   = mm->add_literal(0.5f);
         auto score_threshold = mm->add_literal(0.0f);
@@ -107,16 +97,10 @@ struct test_nms_multi_class : verify_program<test_nms_multi_class>
         auto* mm = p.get_main_module();
 
         migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-        std::vector<float> boxes_vec = {0.0, 0.0, 1.0,   1.0,   0.0, 0.1,   1.0, 1.1, 0.0,
-                                        -0.1, 1.0, 0.9,  0.0,   10.0, 1.0,  11.0, 0.0, 10.1,
-                                        1.0, 11.1, 0.0,  100.0, 1.0,  101.0};
-
         migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}};
-        std::vector<float> scores_vec = {
-            0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
 
-        auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
+        auto scores_l        = mm->add_parameter("scores", scores_s);
         auto max_out_l       = mm->add_literal(int64_t{2});
         auto iou_threshold   = mm->add_literal(0.5f);
         auto score_threshold = mm->add_literal(0.0f);
@@ -143,15 +127,10 @@ struct test_nms_not_center : verify_program<test_nms_not_center>
         auto* mm = p.get_main_module();
 
         migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-        std::vector<float> boxes_vec = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
-                                        0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
-                                        1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
-
         migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-        std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
 
-        auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-        auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
+        auto scores_l        = mm->add_parameter("scores", scores_s);
         auto max_out_l       = mm->add_literal(int64_t{4});
         auto iou_threshold   = mm->add_literal(0.5f);
         auto score_threshold = mm->add_literal(0.0f);

From 18ae57e89a7eb38a1c47104e0cb3eca1aef1337c Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 13 May 2026 14:32:14 -0500
Subject: [PATCH 03/32] AI split into 3 kernels

---
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 262 +++++++++++++-----
 .../migraphx/kernels/nonmaxsuppression.hpp    | 143 ++++++----
 test/verify/test_nms.cpp                      |   1 +
 3 files changed, 286 insertions(+), 120 deletions(-)

diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index ff7dd14e289..189ba2057e8 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -43,12 +43,12 @@ namespace gpu {
 // and reinterpreted in the kernel.
 static constexpr std::size_t nms_bytes_per_data = 24;
 
-// Phase-1 ("compute") kernel: each block runs NMS for its (batch, class) and
-// writes selections into a per-block region of the raw_output scratch plus a
-// per-block count. No global atomic counter is used, so per-block contents
-// are deterministic.
+// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes
+// and bitonic-sorts them by descending score into a per-block region of the
+// `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so
+// the sort has enough parallelism even when NumBoxes is small relative to it.
 // NOLINTNEXTLINE
-static const char* const nms_compute_kernel_src = R"__migraphx__(
+static const char* const nms_sort_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
 #include <args.hpp>
 
@@ -56,23 +56,50 @@ namespace migraphx {
 
 extern "C" {
 
-MIGRAPHX_GLOBAL void nms_kernel(${params})
+MIGRAPHX_GLOBAL void nms_sort_kernel(${params})
 {
-    make_tensors()(${args})([](auto boxes,
-                                auto scores,
+    make_tensors()(${args})([](auto boxes, auto scores, auto sorted) {
+        nonmaxsuppression_sort<${center_point_box},
+                               ${num_batches},
+                               ${num_classes},
+                               ${num_boxes},
+                               ${aligned_num_boxes}>(boxes, scores, sorted);
+    });
+}
+
+}
+
+} // namespace migraphx
+)__migraphx__";
+
+// Phase-2 ("filter") kernel: each block reads its (batch, class)'s sorted
+// records out of the shared `sorted` buffer, builds the IoU mask, runs the
+// greedy filter, and writes selections into a per-block region of the
+// `raw_output` scratch plus a per-block count. No global atomic counter is
+// used, so per-block contents are deterministic.
+// NOLINTNEXTLINE
+static const char* const nms_filter_kernel_src = R"__migraphx__(
+#include <migraphx/kernels/nonmaxsuppression.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+
+MIGRAPHX_GLOBAL void nms_filter_kernel(${params})
+{
+    make_tensors()(${args})([](auto sorted,
                                 auto max_p,
                                 auto iou_p,
                                 auto thr_p,
-                                auto sorted,
                                 auto mask,
                                 auto counts,
                                 auto raw_out) {
-        nonmaxsuppression<${center_point_box},
-                          ${num_batches},
-                          ${num_classes},
-                          ${num_boxes},
-                          ${aligned_num_boxes}>(
-            boxes, scores, max_p, iou_p, thr_p, sorted, mask, counts, raw_out);
+        nonmaxsuppression_filter<${num_batches},
+                                 ${num_classes},
+                                 ${num_boxes},
+                                 ${aligned_num_boxes}>(
+            sorted, max_p, iou_p, thr_p, mask, counts, raw_out);
     });
 }
 
@@ -81,11 +108,12 @@ MIGRAPHX_GLOBAL void nms_kernel(${params})
 } // namespace migraphx
 )__migraphx__";
 
-// Phase-2 ("compact") kernel: a single thread walks the per-block raw_output
-// regions in block_id order and copies the first counts[b] selections from
-// each region into a contiguous prefix of the final output. The order of
-// (block_id 0, 1, ...) is the same as the CPU op's (batch, class) iteration
-// order, so the resulting output matches the CPU op exactly.
+// Phase-3 ("compact") kernel: a single block does an exclusive prefix scan
+// over the per-block counts to obtain output offsets, then its threads
+// scatter selections from each per-block region of `raw_output` into the
+// contiguous prefix of the final output. The order of (block_id 0, 1, ...)
+// is the same as the CPU op's (batch, class) iteration order, so the
+// resulting output matches the CPU op exactly.
 // NOLINTNEXTLINE
 static const char* const nms_compact_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
@@ -111,30 +139,36 @@ struct nms_compiler : compiler<nms_compiler>
 {
     std::vector<std::string> names() const { return {"nonmaxsuppression"}; }
 
-    // Compile the per-block compute kernel. `inputs` is:
-    //   [boxes, scores, max, iou, score_thr, sorted, mask, counts, raw_output]
-    // `raw_output` is the last input so the framework treats it as the
-    // kernel's output buffer; the per-block counts is an in/out scratch.
+    // Compile the per-block sort kernel. `inputs` is:
+    //   [boxes, scores, sorted]
+    // `sorted` is the last input so the framework treats it as the kernel's
+    // chained output flowing into the filter kernel. Launch is sized to
+    // AlignedNumBoxes so the bitonic sort has enough lane-parallelism even
+    // when NumBoxes is small relative to it.
     operation
-    compile_compute(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    compile_sort(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto& boxes_s   = inputs[0];
-        const auto& scores_s  = inputs[1];
-        const auto nb         = boxes_s.lens()[0];
-        const auto b          = boxes_s.lens()[1];
-        const auto nc         = scores_s.lens()[1];
-        const auto aligned_b  = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-        const auto block_size = std::min<std::size_t>(aligned_b, std::size_t{1024});
+        const auto& boxes_s  = inputs[0];
+        const auto& scores_s = inputs[1];
+        const auto nb        = boxes_s.lens()[0];
+        const auto b         = boxes_s.lens()[1];
+        const auto nc        = scores_s.lens()[1];
+        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        // bitonic block_sort uses __syncthreads between every stage; pad up
+        // to a wavefront so degenerate cases (e.g. NumBoxes <= 1) still
+        // launch a valid block.
+        const auto block_size = std::min<std::size_t>(
+            std::max<std::size_t>(aligned_b, std::size_t{64}), std::size_t{1024});
 
         hip_compile_options options;
         options.inputs         = inputs;
-        options.output         = inputs.back(); // raw_output buffer
-        options.kernel_name    = "nms_kernel";
+        options.output         = inputs.back(); // sorted buffer
+        options.kernel_name    = "nms_sort_kernel";
         options.virtual_inputs = inputs;
         options.set_launch_params(v, block_size * nb * nc, block_size);
 
         auto src = interpolate_string(
-            nms_compute_kernel_src,
+            nms_sort_kernel_src,
             {{"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
              {"num_batches", std::to_string(nb)},
@@ -146,11 +180,58 @@ struct nms_compiler : compiler<nms_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // Compile the serial compaction kernel. `inputs` is:
+    // Compile the per-block filter kernel. `inputs` is:
+    //   [sorted, max, iou, score_thr, mask, counts, raw_output]
+    // `raw_output` is the last input so the framework treats it as the
+    // kernel's chained output flowing into the compact kernel. The filter's
+    // inner loops are O(N) per (batch, class), so the launch is sized to
+    // NumBoxes (not AlignedNumBoxes) to avoid leaving padding-only threads
+    // idle. nb, nc, b are passed through the augmented value because the
+    // filter's inputs no longer carry the raw boxes / scores shapes.
+    operation
+    compile_filter(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        const auto nb        = v.at("num_batches").to<std::size_t>();
+        const auto nc        = v.at("num_classes").to<std::size_t>();
+        const auto b         = v.at("num_boxes").to<std::size_t>();
+        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+
+        // Clamp the per-block thread count to [64, 256]: a multiple of the
+        // wavefront size keeps __syncthreads / block_scan well-defined, and
+        // 256 is the sweet spot for the O(N) inner loops without inflating
+        // shared-memory pressure on `removed[N]` (which is sized by N, not by
+        // block_size).
+        const auto block_size = std::min<std::size_t>(
+            std::max<std::size_t>(
+                static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b))),
+                std::size_t{64}),
+            std::size_t{256});
+
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back(); // raw_output buffer
+        options.kernel_name    = "nms_filter_kernel";
+        options.virtual_inputs = inputs;
+        options.set_launch_params(v, block_size * nb * nc, block_size);
+
+        auto src = interpolate_string(
+            nms_filter_kernel_src,
+            {{"params", enum_params(inputs.size(), "void * private_p")},
+             {"args", enum_params(inputs.size(), "private_p")},
+             {"num_batches", std::to_string(nb)},
+             {"num_classes", std::to_string(nc)},
+             {"num_boxes", std::to_string(b)},
+             {"aligned_num_boxes", std::to_string(aligned_b)}});
+        return compile_hip_code_object(ctx, src, options);
+    }
+
+    // Compile the compaction kernel. `inputs` is:
     //   [counts, raw_output, output]
-    // Launched with one thread (single block, single thread) since the work
-    // is intentionally serial: it walks per-block regions in fixed order to
-    // produce the exact byte-for-byte output the CPU op produces.
+    // Launched as a single block: an exclusive prefix scan over counts gives
+    // each per-block region a base offset, then the block's threads scatter
+    // selections to those offsets in parallel. The single-block constraint
+    // keeps the scan in shared memory; `nms_compact` static_asserts a hard
+    // cap on NumBlocks that comfortably fits any realistic ONNX NMS.
     operation
     compile_compact(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
@@ -163,12 +244,23 @@ struct nms_compiler : compiler<nms_compiler>
                                     ? raw_s.elements() / (num_blocks * std::size_t{3})
                                     : std::size_t{0};
 
+        // Pick a block size large enough to give the scan and scatter useful
+        // parallelism without inflating LDS pressure. block_scan requires the
+        // block size to be a multiple of the wavefront size; 64 is the
+        // smallest safe choice for all supported gfx targets.
+        const auto total = std::max(num_blocks * num_boxes, std::size_t{1});
+        const auto block_size = std::min<std::size_t>(
+            std::max<std::size_t>(
+                static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(total))),
+                std::size_t{64}),
+            std::size_t{256});
+
         hip_compile_options options;
         options.inputs         = inputs;
         options.output         = inputs.back();
         options.kernel_name    = "nms_compact_kernel";
         options.virtual_inputs = inputs;
-        options.set_launch_params(v, std::size_t{1}, std::size_t{1});
+        options.set_launch_params(v, block_size, block_size); // one block
 
         auto src = interpolate_string(
             nms_compact_kernel_src,
@@ -179,12 +271,24 @@ struct nms_compiler : compiler<nms_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // Required compiler<> hook: return the compute kernel based on the raw
-    // input shapes. The full two-kernel chain is handled in `compile()`; this
-    // entry point is only used by callers that ask for a single op view.
+    // Required compiler<> hook: return the sort kernel built from the raw
+    // user input shapes (boxes, scores). The full three-kernel chain is
+    // handled in `compile()`; this entry point is only used by callers that
+    // ask for a single op view.
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        return compile_compute(ctx, inputs, v);
+        if(inputs.size() < 2)
+            MIGRAPHX_THROW("nms_compiler: compile_op needs at least boxes and scores");
+        const auto& boxes_s  = inputs[0];
+        const auto& scores_s = inputs[1];
+        if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
+            MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D");
+        const auto nb        = boxes_s.lens()[0];
+        const auto b         = boxes_s.lens()[1];
+        const auto nc        = scores_s.lens()[1];
+        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        const shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
+        return compile_sort(ctx, {boxes_s, scores_s, sorted_s}, v);
     }
 
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
@@ -234,23 +338,34 @@ struct nms_compiler : compiler<nms_compiler>
         // Per-block selection counts (one int32 per (batch, class) block).
         shape counts_s{shape::int32_type, {nb * nc}};
 
-        // Compute kernel input shapes: [user inputs..., sorted, mask, counts, raw_out]
-        std::vector<shape> compute_shapes = raw_shapes;
-        compute_shapes.push_back(sorted_s);
-        compute_shapes.push_back(mask_s);
-        compute_shapes.push_back(counts_s);
-        compute_shapes.push_back(raw_output_s);
+        // Sort kernel input shapes:   [boxes, scores, sorted]
+        std::vector<shape> sort_shapes = {boxes_s, scores_s, sorted_s};
+
+        // Filter kernel input shapes: [sorted, max, iou, thr, mask, counts, raw_out]
+        std::vector<shape> filter_shapes = {sorted_s,
+                                            raw_shapes[2],
+                                            raw_shapes[3],
+                                            raw_shapes[4],
+                                            mask_s,
+                                            counts_s,
+                                            raw_output_s};
 
         // Compact kernel input shapes: [counts, raw_out, output]
-        std::vector<shape> compact_shapes;
-        compact_shapes.push_back(counts_s);
-        compact_shapes.push_back(raw_output_s);
-        compact_shapes.push_back(raw.back()->get_shape());
+        std::vector<shape> compact_shapes = {counts_s, raw_output_s, raw.back()->get_shape()};
+
+        // The filter kernel can't recover nb/nc/b from its input shapes
+        // (sorted/mask/counts/raw_out are all flat scratch buffers), so we
+        // pass them through an augmented value alongside the op attributes.
+        value augmented        = op.to_value();
+        augmented["num_batches"] = nb;
+        augmented["num_classes"] = nc;
+        augmented["num_boxes"]   = b;
 
-        auto compute_kop = compile_compute(ctx, compute_shapes, op.to_value());
-        auto compact_kop = compile_compact(ctx, compact_shapes, op.to_value());
+        auto sort_kop    = compile_sort(ctx, sort_shapes, augmented);
+        auto filter_kop  = compile_filter(ctx, filter_shapes, augmented);
+        auto compact_kop = compile_compact(ctx, compact_shapes, augmented);
 
-        std::vector<operation> kops = {compute_kop, compact_kop};
+        std::vector<operation> kops = {sort_kop, filter_kop, compact_kop};
 
         return {kops,
                 [=](module& m, instruction_ref ins2, const std::vector<operation>& cops) {
@@ -291,21 +406,24 @@ struct nms_compiler : compiler<nms_compiler>
                     out = m.insert_instruction(
                         ins2, make_op("hip::fill", {{"value", 0}}), out);
 
-                    auto compute_args = args;
-                    compute_args.push_back(sorted);
-                    compute_args.push_back(mask);
-                    compute_args.push_back(counts);
-                    compute_args.push_back(raw_out);
-
-                    auto compute_ins =
-                        m.insert_instruction(ins2, cops[0], compute_args);
-
-                    // Use compute_ins (returned raw_out) as the dataflow edge
-                    // so the compact kernel is ordered after the compute
-                    // kernel and the raw_out buffer remains live.
-                    std::vector<instruction_ref> compact_args = {
-                        counts, compute_ins, out};
-                    m.replace_instruction(ins2, cops[1], compact_args);
+                    // Phase 1: sort. Inputs are [boxes, scores, sorted]; the
+                    // returned `sort_ins` is the post-write `sorted` buffer
+                    // which becomes the filter kernel's first input.
+                    auto sort_ins = m.insert_instruction(
+                        ins2, cops[0], {args[0], args[1], sorted});
+
+                    // Phase 2: filter. Use `sort_ins` as the dataflow edge so
+                    // the filter is ordered after sort and `sorted` stays
+                    // live. Returned `filter_ins` is the post-write
+                    // `raw_output` buffer fed to compact.
+                    auto filter_ins = m.insert_instruction(
+                        ins2,
+                        cops[1],
+                        {sort_ins, args[2], args[3], args[4], mask, counts, raw_out});
+
+                    // Phase 3: compact. Counts/filter_ins/out match the
+                    // [counts, raw_output, output] order in compact_shapes.
+                    m.replace_instruction(ins2, cops[2], {counts, filter_ins, out});
                 }};
     }
 };
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index ead79aed578..dfdff7430c8 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -28,6 +28,8 @@
 #include <migraphx/kernels/array.hpp>
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/math.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/scan.hpp>
 #include <migraphx/kernels/sort.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
 #include <migraphx/kernels/types.hpp>
@@ -164,8 +166,8 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma
 
 // Phase 3: greedy filter that writes selections into a per-block region of a
 // scratch buffer (block_id * N entries) and stores the per-block count. A
-// follow-up serial compaction kernel gathers per-block regions in block_id
-// order to produce a deterministic compacted output that matches the CPU op.
+// follow-up compaction kernel gathers per-block regions in block_id order to
+// produce a deterministic compacted output that matches the CPU op.
 template <index_int N>
 __device__ void nms_filter_per_block(index idx,
                                      const nms_data* sorted,
@@ -221,42 +223,60 @@ __device__ void nms_filter_per_block(index idx,
         block_counts[block_id] = static_cast<int32_t>(output_idx);
 }
 
-// Serial compaction: a single thread walks per-block regions in block_id order
-// (which equals the CPU op's (batch, class) iteration order) and copies the
-// first block_counts[b] entries of each region into a contiguous prefix of the
-// final output buffer. Trailing slots are left as the zero fill applied before
-// this kernel runs.
+// Single-block compaction: an exclusive prefix scan over block_counts gives
+// each per-block region a base offset in the final output; threads in the
+// single launched block then scatter the per-block selections in parallel.
+// Block_id order is preserved, which matches the CPU op's (batch, class)
+// iteration order, and each block writes its `block_counts[b]` entries in
+// order, so the final output is bit-for-bit identical to the serial walker.
+// Trailing slots are left as the zero fill applied before this kernel runs.
 template <index_int NumBlocks, index_int NumBoxes>
 __device__ void
 nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output)
 {
-    if(idx.global == 0)
-    {
-        index_int dst = 0;
-        for(index_int b = 0; b < NumBlocks; ++b)
+    static_assert(NumBlocks > 0, "num_blocks must be > 0");
+    // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB
+    // per-block budget for any realistic ONNX NMS (nb * nc).
+    static_assert(NumBlocks <= 16384,
+                  "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
+
+    __shared__ int32_t offsets[NumBlocks];
+
+    // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b].
+    block_scan(
+        idx,
+        op::sum{},
+        int32_t{0},
+        index_int{NumBlocks},
+        [&](auto b) -> int32_t { return block_counts[b]; },
+        [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; });
+    __syncthreads();
+
+    // Parallel scatter: flatten (b, i) so all threads see roughly equal work,
+    // regardless of how `block_counts[b]` is distributed across blocks.
+    constexpr index_int total = NumBlocks * NumBoxes;
+    idx.local_stride(total, [&](auto bi) {
+        const index_int b = bi / NumBoxes;
+        const index_int i = bi % NumBoxes;
+        if(i < static_cast<index_int>(block_counts[b]))
         {
-            const int32_t cnt  = block_counts[b];
-            const int64_t* src = raw_output + b * NumBoxes * 3;
-            for(int32_t i = 0; i < cnt; ++i)
-            {
-                output[dst * 3 + 0] = src[i * 3 + 0];
-                output[dst * 3 + 1] = src[i * 3 + 1];
-                output[dst * 3 + 2] = src[i * 3 + 2];
-                ++dst;
-            }
+            const int64_t* src = raw_output + (b * NumBoxes + i) * 3;
+            int64_t* dst       = output + (offsets[b] + i) * 3;
+            dst[0]             = src[0];
+            dst[1]             = src[1];
+            dst[2]             = src[2];
         }
-    }
+    });
 }
 
-// Per-block driver: one block per (batch_idx, class_idx). Workspace pointers
-// are sliced into per-block segments using idx.group. Selections are written
-// to a per-block region of `raw_output` and the per-block count is written to
-// `block_counts`; a follow-up compact kernel produces the final compacted
-// output that matches the CPU op's ordering.
+// Per-block sort driver: one block per (batch_idx, class_idx). Loads boxes /
+// scores for this (batch, class) into a per-block region of `sorted_buf` and
+// runs a block-level bitonic sort. The result feeds the follow-up filter
+// kernel, which reads `sorted_buf` and writes the IoU mask / per-block
+// selection list.
 //
-// `raw_output_buf` is intentionally the last parameter so that JIT-compiled
-// callers (which use `inputs.back()` as the kernel's output buffer) treat it
-// as the chained output flowing into the compact kernel.
+// `sorted_buf` is the last parameter so the JIT framework treats it as the
+// chained output flowing into the filter kernel.
 template <bool CenterPointBox,
           index_int NumBatches,
           index_int NumClasses,
@@ -264,22 +284,56 @@ template <bool CenterPointBox,
           index_int AlignedNumBoxes,
           class Boxes,
           class Scores,
+          class Sorted>
+__device__ void nonmaxsuppression_sort(Boxes boxes, Scores scores, Sorted sorted_buf)
+{
+    static_assert(NumBatches > 0, "num_batches must be > 0");
+    static_assert(NumClasses > 0, "num_classes must be > 0");
+
+    auto idx                 = make_index();
+    const index_int block_id = idx.group;
+    const int batch_idx      = static_cast<int>(block_id / NumClasses);
+    const int class_idx      = static_cast<int>(block_id % NumClasses);
+
+    nms_data* my_sorted =
+        reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
+
+    const float* boxes_b   = boxes.data() + batch_idx * NumBoxes * 4;
+    const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
+
+    nms_load_and_sort<CenterPointBox, NumBoxes, AlignedNumBoxes>(
+        idx, boxes_b, scores_bc, my_sorted);
+}
+
+// Per-block filter driver: one block per (batch_idx, class_idx). Reads the
+// previously-sorted records out of `sorted_buf`, builds the IoU mask in
+// `mask_buf`, then runs the greedy filter writing selections into a per-block
+// region of `raw_output` and the per-block count into `counts_buf`.
+//
+// The box-coordinate convention has already been normalized into corner form
+// in `sorted_buf`, so this driver does not need `CenterPointBox`.
+//
+// `raw_output_buf` is intentionally the last parameter so that JIT-compiled
+// callers (which use `inputs.back()` as the kernel's output buffer) treat it
+// as the chained output flowing into the compact kernel.
+template <index_int NumBatches,
+          index_int NumClasses,
+          index_int NumBoxes,
+          index_int AlignedNumBoxes,
+          class Sorted,
           class MaxOut,
           class IouThr,
           class ScoreThr,
-          class Sorted,
           class Mask,
           class Counts,
           class RawOutput>
-__device__ void nonmaxsuppression(Boxes boxes,
-                                  Scores scores,
-                                  MaxOut max_out_p,
-                                  IouThr iou_thr_p,
-                                  ScoreThr score_thr_p,
-                                  Sorted sorted_buf,
-                                  Mask mask_buf,
-                                  Counts counts_buf,
-                                  RawOutput raw_output_buf)
+__device__ void nonmaxsuppression_filter(Sorted sorted_buf,
+                                         MaxOut max_out_p,
+                                         IouThr iou_thr_p,
+                                         ScoreThr score_thr_p,
+                                         Mask mask_buf,
+                                         Counts counts_buf,
+                                         RawOutput raw_output_buf)
 {
     static_assert(NumBatches > 0, "num_batches must be > 0");
     static_assert(NumClasses > 0, "num_classes must be > 0");
@@ -294,19 +348,12 @@ __device__ void nonmaxsuppression(Boxes boxes,
         reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
     uint8_t* my_mask = reinterpret_cast<uint8_t*>(mask_buf.data()) + block_id * iou_packed_size;
 
-    const float* boxes_b   = boxes.data() + batch_idx * NumBoxes * 4;
-    const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
-
     // Pull scalar tensor inputs once. They're broadcast to all threads via the
     // common load (each thread reads the same single element).
     const int64_t max_out_val = max_out_p[0];
     const float iou_thr_val   = iou_thr_p[0];
     const float score_thr_val = score_thr_p[0];
 
-    nms_load_and_sort<CenterPointBox, NumBoxes, AlignedNumBoxes>(
-        idx, boxes_b, scores_bc, my_sorted);
-    __syncthreads();
-
     if constexpr(NumBoxes > 1)
     {
         nms_make_iou_mask<NumBoxes>(idx, my_sorted, my_mask, iou_thr_val);
@@ -332,8 +379,8 @@ __device__ void nonmaxsuppression(Boxes boxes,
                                    reinterpret_cast<int32_t*>(counts_buf.data()));
 }
 
-// Serial compact wrapper invoked from the second JIT kernel. Reads the
-// per-block counts and raw_output produced by `nonmaxsuppression` and copies
+// Compact wrapper invoked from the final JIT kernel. Reads the per-block
+// counts and raw_output produced by `nonmaxsuppression_filter` and copies
 // selections into the final output in block_id (i.e. (batch, class)) order.
 // `output` is last to match the JIT convention of using `inputs.back()` as
 // the kernel's logical output buffer.
diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp
index 94828ae35a2..c25e303529d 100644
--- a/test/verify/test_nms.cpp
+++ b/test/verify/test_nms.cpp
@@ -148,6 +148,7 @@ struct test_nms_not_center : verify_program<test_nms_not_center>
     }
 };
 
+// TODO: update this test
 // Test NMS with dynamic inputs that have different compile-time spatial ranges.
 // This reproduces the scenario from nms_repro_minidimmismatch.py where
 // boxes has 10 spatial entries and scores has 5, but at runtime both are

From ced7e69b6742944239f5cbde5e9885f48d3471a9 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 13 May 2026 14:32:28 -0500
Subject: [PATCH 04/32] Change NMS ONNX parsing and ref behavior

---
 src/include/migraphx/op/nonmaxsuppression.hpp | 48 +++++++------------
 src/onnx/parse_nonmaxsuppression.cpp          |  8 +++-
 2 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index b7b13f40354..87a4f1eebb0 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -40,9 +40,14 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/par.hpp>
 
-/*
-https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression
-*/
+/**
+ *  nonmaxsuppression(boxes,
+ *                    scores,
+ *                    optional(max_output_boxes_per_class),
+ *                    optional(iou_threshold),
+ *                    optional(score_threshold));
+ *  Outputs a tuple {selected_box_indices: int64 tensor with dims [max_num_boxes, 3], num_selected_indices: int64 tensor with dims [1]}.
+ */
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
@@ -50,13 +55,11 @@ namespace op {
 struct nonmaxsuppression
 {
     bool center_point_box = false;
-    bool use_dyn_output   = false;
 
     template <class Self, class F>
     static auto reflect(Self& self, F f)
     {
-        return pack(f(self.center_point_box, "center_point_box"),
-                    f(self.use_dyn_output, "use_dyn_output"));
+        return pack(f(self.center_point_box, "center_point_box"));
     }
 
     std::string name() const { return "nonmaxsuppression"; }
@@ -87,21 +90,9 @@ struct nonmaxsuppression
             }
         };
 
-        bool needs_dyn_output = use_dyn_output or inputs.at(0).dynamic() or inputs.at(1).dynamic();
-
-        if(needs_dyn_output)
-        {
-            std::vector<shape::dynamic_dimension> out_lens = {};
-            out_lens.push_back({0, max_num_boxes});
-            out_lens.push_back({3, 3});
-            return {shape::int64_type, out_lens};
-        }
-        else
-        {
-            fixed_shape_error_check();
-            std::vector<std::size_t> out_lens = {max_num_boxes, 3};
-            return {shape::int64_type, out_lens};
-        }
+        fixed_shape_error_check();
+        std::vector<std::size_t> out_lens = {max_num_boxes, 3};
+        return {shape::int64_type, out_lens};
     }
 
     struct box
@@ -236,7 +227,6 @@ struct nonmaxsuppression
                             double iou_threshold,
                             double score_threshold) const
     {
-        std::fill(output.begin(), output.end(), 0);
         const auto& lens       = scores.get_shape().lens();
         const auto num_batches = lens[0];
         const auto num_classes = lens[1];
@@ -325,14 +315,12 @@ struct nonmaxsuppression
                                            score_threshold);
             });
         });
-        if(output_shape.dynamic())
-        {
-            return result.reshape({output_shape.type(), {num_selected, 3}});
-        }
-        else
-        {
-            return result;
-        }
+        shape scalar_int_shape = {shape::int64_type, {1}};
+        argument num_selected_result{scalar_int_shape};
+        num_selected_result.visit([&](auto output){
+            output.begin() = num_selected;
+        });
+        return {{result, num_selected}};
     }
 };
 
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index 212ee5123a4..c2389b0e675 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -39,8 +39,12 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
                           const std::vector<instruction_ref>& args) const
     {
         auto op = parser.load(opd.op_name, info);
-        op.from_value({{"use_dyn_output", parser.use_dyn_output}});
-        return info.add_instruction(op, args);
+        auto nms_ins = info.add_instruction(op, args);
+        // variable ends input slice to handle dynamic shape output
+        auto nms_indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
+        auto nms_num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
+        auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}, nms_indices, nms_num_selected));
+        return slice_ins;
     }
 };
 

From 84c7d3b630f9d9b4109654fc212c72231d2c1240 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 13 May 2026 20:22:25 -0500
Subject: [PATCH 05/32] Progress, refactor

---
 src/include/migraphx/op/nonmaxsuppression.hpp |  10 +-
 src/onnx/parse_nonmaxsuppression.cpp          |   6 +-
 src/targets/gpu/jit/nonmaxsuppression.cpp     |  48 ++--
 .../migraphx/kernels/nonmaxsuppression.hpp    | 241 ++++++++----------
 .../kernels/include/migraphx/kernels/sort.hpp |   4 +-
 src/targets/gpu/lowering.cpp                  |   2 +-
 test/verify/test_nms.cpp                      |   8 +-
 7 files changed, 147 insertions(+), 172 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 87a4f1eebb0..6b9af617909 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -72,7 +72,7 @@ struct nonmaxsuppression
         auto max_classes           = inputs.at(1).max_lens().at(1);
         auto max_spatial_dimension = inputs.at(0).max_lens().at(1);
         // Per ONNX spec, output is [num_selected_indices, 3] where each row is
-        // [batch_index, class_index, box_index].  The maximum possible
+        // [batch_index, class_index, box_index]. The maximum possible
         // num_selected_indices = num_batches * num_classes * spatial_dimension.
         const auto max_num_boxes = max_batches * max_classes * max_spatial_dimension;
 
@@ -92,7 +92,9 @@ struct nonmaxsuppression
 
         fixed_shape_error_check();
         std::vector<std::size_t> out_lens = {max_num_boxes, 3};
-        return {shape::int64_type, out_lens};
+        shape s_ind{shape::int64_type, out_lens};
+        shape s_num_selected{shape::int64_type, {1}};
+        return shape({s_ind, s_num_selected});
     }
 
     struct box
@@ -223,7 +225,7 @@ struct nonmaxsuppression
     std::size_t compute_nms(Output output,
                             const Boxes& boxes,
                             const Scores& scores,
-                            std::size_t max_output_boxes_per_class,
+                            int64_t max_output_boxes_per_class,
                             double iou_threshold,
                             double score_threshold) const
     {
@@ -320,7 +322,7 @@ struct nonmaxsuppression
         num_selected_result.visit([&](auto output){
             output.begin() = num_selected;
         });
-        return {{result, num_selected}};
+        return {{result, num_selected_result}};
     }
 };
 
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index c2389b0e675..0ffffa03bcd 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -41,9 +41,9 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
         auto op = parser.load(opd.op_name, info);
         auto nms_ins = info.add_instruction(op, args);
         // variable ends input slice to handle dynamic shape output
-        auto nms_indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
-        auto nms_num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
-        auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}, nms_indices, nms_num_selected));
+        auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
+        auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
+        auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
         return slice_ins;
     }
 };
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index 189ba2057e8..a473f8477be 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -135,54 +135,48 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 } // namespace migraphx
 )__migraphx__";
 
+// TODO: use compute_block_size and/or compute_global_for?
+// TODO: Don't need num_batches, num_classes, num_boxes as template parameters since tensor_view has shapes.
 struct nms_compiler : compiler<nms_compiler>
 {
     std::vector<std::string> names() const { return {"nonmaxsuppression"}; }
 
-    // Compile the per-block sort kernel. `inputs` is:
-    //   [boxes, scores, sorted]
-    // `sorted` is the last input so the framework treats it as the kernel's
-    // chained output flowing into the filter kernel. Launch is sized to
-    // AlignedNumBoxes so the bitonic sort has enough lane-parallelism even
-    // when NumBoxes is small relative to it.
+    // Compile the sort kernel.
+    // inputs: [boxes, scores, sorted]
     operation
     compile_sort(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         const auto& boxes_s  = inputs[0];
         const auto& scores_s = inputs[1];
-        const auto nb        = boxes_s.lens()[0];
-        const auto b         = boxes_s.lens()[1];
-        const auto nc        = scores_s.lens()[1];
-        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-        // bitonic block_sort uses __syncthreads between every stage; pad up
-        // to a wavefront so degenerate cases (e.g. NumBoxes <= 1) still
-        // launch a valid block.
-        const auto block_size = std::min<std::size_t>(
-            std::max<std::size_t>(aligned_b, std::size_t{64}), std::size_t{1024});
+        const auto num_batches        = boxes_s.lens()[0];
+        const auto num_boxes          = boxes_s.lens()[1];
+        const auto num_classes        = scores_s.lens()[1];
+        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        // clamp between 64 and 1024 threads based on aligned_num_boxes
+        const auto block_size = std::min<std::size_t>(std::max<std::size_t>(aligned_b, std::size_t{64}), std::size_t{1024});
 
         hip_compile_options options;
         options.inputs         = inputs;
         options.output         = inputs.back(); // sorted buffer
         options.kernel_name    = "nms_sort_kernel";
         options.virtual_inputs = inputs;
-        options.set_launch_params(v, block_size * nb * nc, block_size);
+        options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
 
         auto src = interpolate_string(
             nms_sort_kernel_src,
             {{"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
-             {"num_batches", std::to_string(nb)},
-             {"num_classes", std::to_string(nc)},
-             {"num_boxes", std::to_string(b)},
+             {"num_batches", std::to_string(num_batches)},
+             {"num_classes", std::to_string(num_classes)},
+             {"num_boxes", std::to_string(num_boxes)},
              {"aligned_num_boxes", std::to_string(aligned_b)},
              {"center_point_box",
               v.at("center_point_box").to<bool>() ? "true" : "false"}});
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // Compile the per-block filter kernel. `inputs` is:
-    //   [sorted, max, iou, score_thr, mask, counts, raw_output]
-    // `raw_output` is the last input so the framework treats it as the
+    // inputs: [sorted, max, iou, score_thr, mask, counts, raw_output]
+    // `raw_output` is the last input so the framework treats it as the
     // kernel's chained output flowing into the compact kernel. The filter's
     // inner loops are O(N) per (batch, class), so the launch is sized to
     // NumBoxes (not AlignedNumBoxes) to avoid leaving padding-only threads
@@ -225,6 +219,7 @@ struct nms_compiler : compiler<nms_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
+    // TODO: REDO this whole thing. It doesn't make sense.
     // Compile the compaction kernel. `inputs` is:
     //   [counts, raw_output, output]
     // Launched as a single block: an exclusive prefix scan over counts gives
@@ -260,6 +255,7 @@ struct nms_compiler : compiler<nms_compiler>
         options.output         = inputs.back();
         options.kernel_name    = "nms_compact_kernel";
         options.virtual_inputs = inputs;
+        // BUG: this is not one block
         options.set_launch_params(v, block_size, block_size); // one block
 
         auto src = interpolate_string(
@@ -398,14 +394,6 @@ struct nms_compiler : compiler<nms_compiler>
                     auto counts = m.insert_instruction(
                         ins2, make_op("hip::allocate", {{"shape", to_value(counts_s)}}));
 
-                    // Pre-zero the final output buffer so unwritten rows match
-                    // the CPU implementation's behavior (trailing zeros). The
-                    // counts and raw_out scratch don't need zeroing: each
-                    // block writes its count exactly once and the compact
-                    // kernel only reads counts[b] entries from each block.
-                    out = m.insert_instruction(
-                        ins2, make_op("hip::fill", {{"value", 0}}), out);
-
                     // Phase 1: sort. Inputs are [boxes, scores, sorted]; the
                     // returned `sort_ins` is the post-write `sorted` buffer
                     // which becomes the filter kernel's first input.
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index dfdff7430c8..c4c27a76ed3 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -36,9 +36,6 @@
 
 namespace migraphx {
 
-// Per-box record carried through the sort. Box corners are stored normalized
-// to (xmin, ymin, xmax, ymax) so the IoU computation is independent of the
-// center_point_box attribute.
 struct nms_data
 {
     float score;
@@ -47,24 +44,25 @@ struct nms_data
 };
 
 // Decode a single box into (xmin, ymin, xmax, ymax) corners.
-template <bool CenterPointBox>
-__device__ inline array<float, 4> nms_normalize_box(const float* b)
+// Normalize such that [x1, y1] is the bottom left corner
+template <bool CenterPointBox, class Box>
+__device__ inline array<float, 4> nms_normalize_box(Box box)
 {
     if constexpr(CenterPointBox)
     {
-        const float xc = b[0];
-        const float yc = b[1];
-        const float hw = b[2] * 0.5f;
-        const float hh = b[3] * 0.5f;
+        const float xc = box[0];
+        const float yc = box[1];
+        const float hw = box[2] * 0.5f;
+        const float hh = box[3] * 0.5f;
         return {xc - hw, yc - hh, xc + hw, yc + hh};
     }
     else
     {
         // ONNX layout: [y1, x1, y2, x2]; corners may be in either order.
-        const float y1   = b[0];
-        const float x1   = b[1];
-        const float y2   = b[2];
-        const float x2   = b[3];
+        const float y1   = box[0];
+        const float x1   = box[1];
+        const float y2   = box[2];
+        const float x2   = box[3];
         const float xmin = min(x1, x2);
         const float xmax = max(x1, x2);
         const float ymin = min(y1, y2);
@@ -73,8 +71,9 @@ __device__ inline array<float, 4> nms_normalize_box(const float* b)
     }
 }
 
+template <class Box>
 __device__ inline bool
-nms_iou_over_threshold(const array<float, 4>& a, const array<float, 4>& b, float threshold)
+nms_iou_over_threshold(const Box a, Box b, float threshold)
 {
     const float left   = max(a[0], b[0]);
     const float right  = min(a[2], b[2]);
@@ -97,6 +96,7 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N
     return (i * N - (i * (i + 1)) / 2) + j - (i + 1);
 }
 
+// Comparator for sorting nms_data{}.
 struct nms_score_greater
 {
     constexpr bool operator()(const nms_data& a, const nms_data& b) const
@@ -105,18 +105,42 @@ struct nms_score_greater
     }
 };
 
-// Phase 1: load (score, box, box_index) tuples into a per-block buffer of
-// AlignedN entries (power of two), padding the [N, AlignedN) tail with sentinel
-// values, then sort the buffer in descending order by score.
-template <bool CenterPointBox, index_int N, index_int AlignedN>
-__device__ void nms_load_and_sort(index idx,
-                                  const float* boxes_b,   // [N, 4]
-                                  const float* scores_bc, // [N]
-                                  nms_data* sorted)
+// Phase 1
+// One block per (batch_idx, class_idx).
+// Load data into per-block buffer of nms_data.
+// Pads values after N with sentinel values.
+// Sorts the nms_data in descending order by score.
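+// boxes_tv: float boxes, indexed here as [batch][box][4].
+// scores_tv: float scores, indexed here as [batch][class][box].
+// out_tv: buffer reinterpreted as nms_data{}, one AlignedNumBoxes region per block.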
+template <bool CenterPointBox,
+          index_int NumBatches,
+          index_int NumClasses,
+          index_int NumBoxes,
+          index_int AlignedNumBoxes,
+          class Boxes,
+          class Scores,
+          class Output>
+__device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output out_tv)
 {
-    idx.local_stride(AlignedN, [&](auto i) {
-        nms_data d;
-        if(i < N)
+    static_assert(NumBatches > 0, "num_batches must be > 0");
+    static_assert(NumClasses > 0, "num_classes must be > 0");
+
+    auto idx = make_index();
+    const index_int block_id = idx.group;
+    const int batch_idx      = static_cast<int>(block_id / NumClasses);
+    const int class_idx      = static_cast<int>(block_id % NumClasses);
+    
+    constexpr auto block_out_shape = make_shape(index_ints<AlignedNumBoxes>{});
+    auto* p = reinterpret_cast<nms_data*>(out_tv.data()) + block_id * AlignedNumBoxes;
+    auto block_out_tv = make_tensor_view<nms_data>(p, block_out_shape);
+
+    const auto* boxes_b   = boxes_tv.data() + batch_idx * NumBoxes * 4;
+    const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
+
+    nms_data d;
+    idx.local_stride(AlignedNumBoxes, [&](auto i) {
+        if(i < NumBoxes)
         {
             d.score     = scores_bc[i];
             d.box       = nms_normalize_box<CenterPointBox>(boxes_b + i * 4);
@@ -130,14 +154,15 @@ __device__ void nms_load_and_sort(index idx,
             d.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
             d.box_index = -1;
         }
-        sorted[i] = d;
+        block_out_tv[i] = d;
     });
     __syncthreads();
-    bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedN>(idx, sorted);
+    bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_out_tv);
 }
 
-// Phase 2: build the packed upper-triangular IoU mask for the N sorted boxes.
-// Work is striped (i, N-1-i) per thread so each thread does roughly the same
+// Phase 2
+// Build the packed upper-triangular IoU mask for the N sorted boxes.
+// Work is striped such that each thread does a multiple of 2 rows so each does roughly the same
 // amount of work regardless of where it falls in the triangle.
 template <index_int N>
 __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* mask, float iou_thr)
@@ -164,17 +189,16 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma
     }
 }
 
-// Phase 3: greedy filter that writes selections into a per-block region of a
-// scratch buffer (block_id * N entries) and stores the per-block count. A
-// follow-up compaction kernel gathers per-block regions in block_id order to
-// produce a deterministic compacted output that matches the CPU op.
+// Phase 3
+// Greedy filter that writes selections into a per-block region of a
+// scratch buffer (block_id * N entries) and stores the per-block count.
 template <index_int N>
 __device__ void nms_filter_per_block(index idx,
                                      const nms_data* sorted,
                                      const uint8_t* mask,
                                      int batch_idx,
                                      int class_idx,
-                                     index_int max_output,
+                                     int64_t max_output,
                                      float score_thr,
                                      int64_t* raw_output,    // [num_blocks * N * 3]
                                      int32_t* block_counts)  // [num_blocks]
@@ -223,95 +247,13 @@ __device__ void nms_filter_per_block(index idx,
         block_counts[block_id] = static_cast<int32_t>(output_idx);
 }
 
-// Single-block compaction: an exclusive prefix scan over block_counts gives
-// each per-block region a base offset in the final output; threads in the
-// single launched block then scatter the per-block selections in parallel.
-// Block_id order is preserved, which matches the CPU op's (batch, class)
-// iteration order, and each block writes its `block_counts[b]` entries in
-// order, so the final output is bit-for-bit identical to the serial walker.
-// Trailing slots are left as the zero fill applied before this kernel runs.
-template <index_int NumBlocks, index_int NumBoxes>
-__device__ void
-nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output)
-{
-    static_assert(NumBlocks > 0, "num_blocks must be > 0");
-    // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB
-    // per-block budget for any realistic ONNX NMS (nb * nc).
-    static_assert(NumBlocks <= 16384,
-                  "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
-
-    __shared__ int32_t offsets[NumBlocks];
-
-    // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b].
-    block_scan(
-        idx,
-        op::sum{},
-        int32_t{0},
-        index_int{NumBlocks},
-        [&](auto b) -> int32_t { return block_counts[b]; },
-        [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; });
-    __syncthreads();
-
-    // Parallel scatter: flatten (b, i) so all threads see roughly equal work,
-    // regardless of how `block_counts[b]` is distributed across blocks.
-    constexpr index_int total = NumBlocks * NumBoxes;
-    idx.local_stride(total, [&](auto bi) {
-        const index_int b = bi / NumBoxes;
-        const index_int i = bi % NumBoxes;
-        if(i < static_cast<index_int>(block_counts[b]))
-        {
-            const int64_t* src = raw_output + (b * NumBoxes + i) * 3;
-            int64_t* dst       = output + (offsets[b] + i) * 3;
-            dst[0]             = src[0];
-            dst[1]             = src[1];
-            dst[2]             = src[2];
-        }
-    });
-}
-
-// Per-block sort driver: one block per (batch_idx, class_idx). Loads boxes /
-// scores for this (batch, class) into a per-block region of `sorted_buf` and
-// runs a block-level bitonic sort. The result feeds the follow-up filter
-// kernel, which reads `sorted_buf` and writes the IoU mask / per-block
-// selection list.
-//
-// `sorted_buf` is the last parameter so the JIT framework treats it as the
-// chained output flowing into the filter kernel.
-template <bool CenterPointBox,
-          index_int NumBatches,
-          index_int NumClasses,
-          index_int NumBoxes,
-          index_int AlignedNumBoxes,
-          class Boxes,
-          class Scores,
-          class Sorted>
-__device__ void nonmaxsuppression_sort(Boxes boxes, Scores scores, Sorted sorted_buf)
-{
-    static_assert(NumBatches > 0, "num_batches must be > 0");
-    static_assert(NumClasses > 0, "num_classes must be > 0");
-
-    auto idx                 = make_index();
-    const index_int block_id = idx.group;
-    const int batch_idx      = static_cast<int>(block_id / NumClasses);
-    const int class_idx      = static_cast<int>(block_id % NumClasses);
-
-    nms_data* my_sorted =
-        reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
-
-    const float* boxes_b   = boxes.data() + batch_idx * NumBoxes * 4;
-    const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
-
-    nms_load_and_sort<CenterPointBox, NumBoxes, AlignedNumBoxes>(
-        idx, boxes_b, scores_bc, my_sorted);
-}
-
 // Per-block filter driver: one block per (batch_idx, class_idx). Reads the
 // previously-sorted records out of `sorted_buf`, builds the IoU mask in
 // `mask_buf`, then runs the greedy filter writing selections into a per-block
 // region of `raw_output` and the per-block count into `counts_buf`.
 //
-// The box-coordinate convention has already been normalized into corner form
-// in `sorted_buf`, so this driver does not need `CenterPointBox`.
+// Expects the box coordinates in `sorted_buf` to have already been normalized
+// into corner form.
 //
 // `raw_output_buf` is intentionally the last parameter so that JIT-compiled
 // callers (which use `inputs.back()` as the kernel's output buffer) treat it
@@ -350,7 +292,7 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
 
     // Pull scalar tensor inputs once. They're broadcast to all threads via the
     // common load (each thread reads the same single element).
-    const int64_t max_out_val = max_out_p[0];
+    const int64_t max_output_boxes_per_class = max_out_p[0];
     const float iou_thr_val   = iou_thr_p[0];
     const float score_thr_val = score_thr_p[0];
 
@@ -360,25 +302,64 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
         __syncthreads();
     }
 
-    // The CPU op reads max_output_boxes_per_class as std::size_t, so a negative
-    // signed value is treated as a very large unsigned (effectively unlimited).
-    // Mirror that here by reinterpreting as unsigned and then capping at
-    // NumBoxes, which is the most we could ever emit per (batch, class) block.
-    const auto max_unsigned    = static_cast<uint64_t>(max_out_val);
-    const index_int max_output = (max_unsigned > static_cast<uint64_t>(NumBoxes))
-                                     ? static_cast<index_int>(NumBoxes)
-                                     : static_cast<index_int>(max_unsigned);
     nms_filter_per_block<NumBoxes>(idx,
                                    my_sorted,
                                    my_mask,
                                    batch_idx,
                                    class_idx,
-                                   max_output,
+                                   max_output_boxes_per_class,
                                    score_thr_val,
                                    reinterpret_cast<int64_t*>(raw_output_buf.data()),
                                    reinterpret_cast<int32_t*>(counts_buf.data()));
 }
 
+// Single-block compaction: an exclusive prefix scan over block_counts gives
+// each per-block region a base offset in the final output; threads in the
+// single launched block then scatter the per-block selections in parallel.
+// Block_id order is preserved, which matches the CPU op's (batch, class)
+// iteration order, and each block writes its `block_counts[b]` entries in
+// order, so the final output is bit-for-bit identical to the serial walker.
+// Trailing slots past the last selection are not written by this kernel.
+// TODO: this explanation makes no sense
+template <index_int NumBlocks, index_int NumBoxes>
+__device__ void
+nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output)
+{
+    static_assert(NumBlocks > 0, "num_blocks must be > 0");
+    // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB
+    // per-block budget for any realistic ONNX NMS (nb * nc).
+    static_assert(NumBlocks <= 16384,
+                  "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
+
+    __shared__ int32_t offsets[NumBlocks];
+
+    // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b].
+    block_scan(
+        idx,
+        op::sum{},
+        int32_t{0},
+        index_int{NumBlocks},
+        [&](auto b) -> int32_t { return block_counts[b]; },
+        [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; });
+    __syncthreads();
+
+    // Parallel scatter: flatten (b, i) so all threads see roughly equal work,
+    // regardless of how `block_counts[b]` is distributed across blocks.
+    constexpr index_int total = NumBlocks * NumBoxes;
+    idx.local_stride(total, [&](auto bi) {
+        const index_int b = bi / NumBoxes;
+        const index_int i = bi % NumBoxes;
+        if(i < static_cast<index_int>(block_counts[b]))
+        {
+            const int64_t* src = raw_output + (b * NumBoxes + i) * 3;
+            int64_t* dst       = output + (offsets[b] + i) * 3;
+            dst[0]             = src[0];
+            dst[1]             = src[1];
+            dst[2]             = src[2];
+        }
+    });
+}
+
 // Compact wrapper invoked from the final JIT kernel. Reads the per-block
 // counts and raw_output produced by `nonmaxsuppression_filter` and copies
 // selections into the final output in block_id (i.e. (batch, class)) order.
@@ -387,7 +368,7 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
 template <index_int NumBlocks, index_int NumBoxes, class Counts, class RawOutput, class Output>
 __device__ void nonmaxsuppression_compact(Counts counts_buf,
                                           RawOutput raw_output_buf,
-                                          Output output)
+                                          Output output_indices)
 {
     static_assert(NumBlocks > 0, "num_blocks must be > 0");
 
@@ -395,7 +376,7 @@ __device__ void nonmaxsuppression_compact(Counts counts_buf,
     nms_compact<NumBlocks, NumBoxes>(idx,
                                      reinterpret_cast<const int64_t*>(raw_output_buf.data()),
                                      reinterpret_cast<const int32_t*>(counts_buf.data()),
-                                     output.data());
+                                     output_indices.data());
 }
 
 } // namespace migraphx
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index fa4d1c981e2..980a628682b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -146,8 +146,8 @@ struct bitonic_sort
     // (e.g. greater{} -> descending). The buffer must be sized to N (a
     // compile-time power of 2); callers pad with sentinel values when the
     // logical length is smaller.
-    template <index_int N, class T>
-    __device__ void block_sort(index idx, T* buf) const
+    template <index_int N, class Array>
+    __device__ void block_sort(index idx, Array& buf) const
     {
         static_assert(is_power_of_2(N), "N must be a power of 2");
         for(index_int k = 2; k <= N; k <<= 1)
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 9e510f10047..56c6039c075 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -108,7 +108,7 @@ struct miopen_apply
         add_if_op();
         add_loop_op();
         add_neg_op();
-        add_nms_op();
+        //add_nms_op();
         add_lrn_op();
         add_convolution_backwards_op();
         add_select_module_op();
diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp
index c25e303529d..9039784a689 100644
--- a/test/verify/test_nms.cpp
+++ b/test/verify/test_nms.cpp
@@ -43,14 +43,18 @@ struct test_nms : verify_program<test_nms>
         auto iou_threshold   = mm->add_literal(0.5f);
         auto score_threshold = mm->add_literal(0.0f);
 
-        auto r =
+        auto nms =
             mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
                                 boxes_l,
                                 scores_l,
                                 max_out_l,
                                 iou_threshold,
                                 score_threshold);
-        mm->add_return({r});
+
+        auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
+        auto num_selected = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms);
+        auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
+        mm->add_return({slice_ins});
 
         return p;
     }

From 43c10be99f4e05becae45c9761eaa0cf6301fa8a Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 14 May 2026 17:43:50 -0500
Subject: [PATCH 06/32] Cleanup before refactor into 3 JIT instructions

---
 .../include/migraphx/gpu/device/scan.hpp      |   9 ++
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 126 +++++++-----------
 .../migraphx/kernels/nonmaxsuppression.hpp    | 103 +++++++-------
 3 files changed, 99 insertions(+), 139 deletions(-)

diff --git a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
index 5a66f7f7308..95ce82f224e 100644
--- a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
@@ -33,6 +33,14 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
 
+// Inclusive prefix sum within a kernel block.
+// Hillis-Steele scan with double-buffered (ping-pong) shared array.
+// `N`: upper bound on blockDim.x, sizes the shared buffer.
+// `op`: associative binary reduce function ex. sum or max.
+// `init`: initializer
+// `fs`: striding function for thread work distribution.
+// `input`: input with input(index_int).
+// `output`: output with output(index_int, inclusive_scan_value_at_index_int).
 template <index_int N,
           class Op,
           class T,
@@ -72,6 +80,7 @@ __device__ void block_scan(index idx, Op op, T init, ForStride fs, Input input,
     });
 }
 
+// Overload of block_scan with default local_stride up to `n`.
 template <index_int N, class Op, class T, class Input, class Output>
 __device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output)
 {
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index a473f8477be..dc0202a4109 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -43,12 +43,12 @@ namespace gpu {
 // and reinterpreted in the kernel.
 static constexpr std::size_t nms_bytes_per_data = 24;
 
-// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes
+// Phase 1 ("sort") kernel: each block normalizes its (batch, class)'s boxes
 // and bitonic-sorts them by descending score into a per-block region of the
 // `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so
 // the sort has enough parallelism even when NumBoxes is small relative to it.
 // NOLINTNEXTLINE
-static const char* const nms_sort_kernel_src = R"__migraphx__(
+static const char* const nms_load_sort_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
 #include <args.hpp>
 
@@ -125,8 +125,8 @@ extern "C" {
 
 MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 {
-    make_tensors()(${args})([](auto counts, auto raw_out, auto out) {
-        nonmaxsuppression_compact<${num_blocks}, ${num_boxes}>(counts, raw_out, out);
+    make_tensors()(${args})([](auto bc_counts, auto output_indices, auto output_num_selected) {
+        nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(bc_counts, output_indices, output_num_selected);
     });
 }
 
@@ -144,7 +144,7 @@ struct nms_compiler : compiler<nms_compiler>
     // Compile the sort kernel.
     // inputs: [boxes, scores, sorted]
     operation
-    compile_sort(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    compile_load_sort(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         const auto& boxes_s  = inputs[0];
         const auto& scores_s = inputs[1];
@@ -157,7 +157,7 @@ struct nms_compiler : compiler<nms_compiler>
 
         hip_compile_options options;
         options.inputs         = inputs;
-        options.output         = inputs.back(); // sorted buffer
+        options.output         = inputs.back();
         options.kernel_name    = "nms_sort_kernel";
         options.virtual_inputs = inputs;
         options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
@@ -203,7 +203,7 @@ struct nms_compiler : compiler<nms_compiler>
 
         hip_compile_options options;
         options.inputs         = inputs;
-        options.output         = inputs.back(); // raw_output buffer
+        options.output         = inputs.back();
         options.kernel_name    = "nms_filter_kernel";
         options.virtual_inputs = inputs;
         options.set_launch_params(v, block_size * nb * nc, block_size);
@@ -220,13 +220,7 @@ struct nms_compiler : compiler<nms_compiler>
     }
 
     // TODO: REDO this whole thing. It doesn't make sense.
-    // Compile the compaction kernel. `inputs` is:
-    //   [counts, raw_output, output]
-    // Launched as a single block: an exclusive prefix scan over counts gives
-    // each per-block region a base offset, then the block's threads scatter
-    // selections to those offsets in parallel. The single-block constraint
-    // keeps the scan in shared memory; `nms_compact` static_asserts a hard
-    // cap on NumBlocks that comfortably fits any realistic ONNX NMS.
+    // Compiles the nms_compact_kernel.
     operation
     compile_compact(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
@@ -239,10 +233,6 @@ struct nms_compiler : compiler<nms_compiler>
                                     ? raw_s.elements() / (num_blocks * std::size_t{3})
                                     : std::size_t{0};
 
-        // Pick a block size large enough to give the scan and scatter useful
-        // parallelism without inflating LDS pressure. block_scan requires the
-        // block size to be a multiple of the wavefront size; 64 is the
-        // smallest safe choice for all supported gfx targets.
         const auto total = std::max(num_blocks * num_boxes, std::size_t{1});
         const auto block_size = std::min<std::size_t>(
             std::max<std::size_t>(
@@ -255,36 +245,21 @@ struct nms_compiler : compiler<nms_compiler>
         options.output         = inputs.back();
         options.kernel_name    = "nms_compact_kernel";
         options.virtual_inputs = inputs;
-        // BUG: this is not one block
-        options.set_launch_params(v, block_size, block_size); // one block
+        options.set_launch_params(v, 1, block_size);
 
         auto src = interpolate_string(
             nms_compact_kernel_src,
             {{"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
-             {"num_blocks", std::to_string(num_blocks)},
+             {"num_batch_class", std::to_string(num_batch_class)},
              {"num_boxes", std::to_string(num_boxes)}});
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // Required compiler<> hook: return the sort kernel built from the raw
-    // user input shapes (boxes, scores). The full three-kernel chain is
-    // handled in `compile()`; this entry point is only used by callers that
-    // ask for a single op view.
+    // Required compiler<> hook, should not be used for this compiler.
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        if(inputs.size() < 2)
-            MIGRAPHX_THROW("nms_compiler: compile_op needs at least boxes and scores");
-        const auto& boxes_s  = inputs[0];
-        const auto& scores_s = inputs[1];
-        if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
-            MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D");
-        const auto nb        = boxes_s.lens()[0];
-        const auto b         = boxes_s.lens()[1];
-        const auto nc        = scores_s.lens()[1];
-        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-        const shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
-        return compile_sort(ctx, {boxes_s, scores_s, sorted_s}, v);
+        return {};
     }
 
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
@@ -326,28 +301,27 @@ struct nms_compiler : compiler<nms_compiler>
         const auto aligned_b  = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
         const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1};
 
-        shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
-        shape mask_s{shape::uint8_type, {nb * nc * iou_packed}};
-        // Per-block raw output: nb*nc blocks, each can write up to b
+        shape sorted_shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
+        shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}};
+        // Per-block output: nb*nc blocks, each can write up to b
         // selections of (batch, class, box_idx) int64 triples.
-        shape raw_output_s{shape::int64_type, {nb * nc * b * 3}};
-        // Per-block selection counts (one int32 per (batch, class) block).
-        shape counts_s{shape::int32_type, {nb * nc}};
+        shape output_s{shape::int64_type, {nb * nc * b * 3}};
+        // Per-batch-per-class selection counts (one index_int per (batch, class) block).
+        shape bc_counts_shape{shape::int32_type, {nb * nc}};
 
         // Sort kernel input shapes:   [boxes, scores, sorted]
-        std::vector<shape> sort_shapes = {boxes_s, scores_s, sorted_s};
+        std::vector<shape> sort_shapes = {boxes_s, scores_s, sorted_shape};
 
         // Filter kernel input shapes: [sorted, max, iou, thr, mask, counts, raw_out]
-        std::vector<shape> filter_shapes = {sorted_s,
+        std::vector<shape> filter_shapes = {sorted_shape,
                                             raw_shapes[2],
                                             raw_shapes[3],
                                             raw_shapes[4],
-                                            mask_s,
-                                            counts_s,
+                                            mask_shape,
+                                            bc_counts_shape,
                                             raw_output_s};
 
-        // Compact kernel input shapes: [counts, raw_out, output]
-        std::vector<shape> compact_shapes = {counts_s, raw_output_s, raw.back()->get_shape()};
+        std::vector<shape> compact_shapes = {bc_counts_shape, output_s, {shape::int64_type, {1}}};
 
         // The filter kernel can't recover nb/nc/b from its input shapes
         // (sorted/mask/counts/raw_out are all flat scratch buffers), so we
@@ -361,57 +335,47 @@ struct nms_compiler : compiler<nms_compiler>
         auto filter_kop  = compile_filter(ctx, filter_shapes, augmented);
         auto compact_kop = compile_compact(ctx, compact_shapes, augmented);
 
+        // kernel operations
         std::vector<operation> kops = {sort_kop, filter_kop, compact_kop};
 
         return {kops,
-                [=](module& m, instruction_ref ins2, const std::vector<operation>& cops) {
-                    auto args = ins2->inputs();
-                    auto out  = args.back();
+                [=](module& m, instruction_ref rep_ins, const std::vector<operation>& ops) {
+                    auto args = rep_ins->inputs();
+                    auto output  = args.back();
                     args.pop_back();
-
+                    
+                    // fill out optional arguments
                     if(args.size() < 3)
                     {
                         args.push_back(m.insert_literal(
-                            ins2, literal{default_max_s, {std::int64_t{0}}}));
+                            rep_ins, literal{default_max_s, {std::int64_t{0}}}));
                     }
                     if(args.size() < 4)
                     {
                         args.push_back(
-                            m.insert_literal(ins2, literal{default_iou_s, {0.0f}}));
+                            m.insert_literal(rep_ins, literal{default_iou_s, {0.0f}}));
                     }
                     if(args.size() < 5)
                     {
                         args.push_back(
-                            m.insert_literal(ins2, literal{default_thr_s, {0.0f}}));
+                            m.insert_literal(rep_ins, literal{default_thr_s, {0.0f}}));
                     }
 
-                    auto sorted = m.insert_instruction(
-                        ins2, make_op("hip::allocate", {{"shape", to_value(sorted_s)}}));
-                    auto mask = m.insert_instruction(
-                        ins2, make_op("hip::allocate", {{"shape", to_value(mask_s)}}));
-                    auto raw_out = m.insert_instruction(
-                        ins2, make_op("hip::allocate", {{"shape", to_value(raw_output_s)}}));
-                    auto counts = m.insert_instruction(
-                        ins2, make_op("hip::allocate", {{"shape", to_value(counts_s)}}));
-
-                    // Phase 1: sort. Inputs are [boxes, scores, sorted]; the
-                    // returned `sort_ins` is the post-write `sorted` buffer
-                    // which becomes the filter kernel's first input.
-                    auto sort_ins = m.insert_instruction(
-                        ins2, cops[0], {args[0], args[1], sorted});
-
-                    // Phase 2: filter. Use `sort_ins` as the dataflow edge so
-                    // the filter is ordered after sort and `sorted` stays
-                    // live. Returned `filter_ins` is the post-write
-                    // `raw_output` buffer fed to compact.
+                    auto sorted = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(sorted_shape)}}));
+                    auto mask = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(mask_shape)}}));
+                    auto bc_counts = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(bc_counts_shape)}}));
+                    auto output_num_selected = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(scalar_shape)}}));
+
+                    auto load_sort_ins = m.insert_instruction(rep_ins, ops[0], {args[0], args[1], sorted});
+
                     auto filter_ins = m.insert_instruction(
-                        ins2,
-                        cops[1],
-                        {sort_ins, args[2], args[3], args[4], mask, counts, raw_out});
+                        rep_ins,
+                        ops[1],
+                        {load_sort_ins, args[2], args[3], args[4], mask, bc_counts, output});
 
-                    // Phase 3: compact. Counts/filter_ins/out match the
-                    // [counts, raw_output, output] order in compact_shapes.
-                    m.replace_instruction(ins2, cops[2], {counts, filter_ins, out});
+                    output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 0}}), filter_ins); 
+                    auto bc_counts_output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 1}}), filter_ins);
+                    m.replace_instruction(rep_ins, ops[2], {bc_counts_output, output, output_num_selected});
                 }};
     }
 };
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index c4c27a76ed3..3ac8520fc53 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -105,7 +105,7 @@ struct nms_score_greater
     }
 };
 
-// Phase 1
+// Kernel 1.
 // One block per (batch_idx, class_idx).
 // Load data into per-block buffer of nms_data.
 // Pads values after N with sentinel values.
@@ -160,7 +160,7 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
     bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_out_tv);
 }
 
-// Phase 2
+// Part of kernel 2.
 // Build the packed upper-triangular IoU mask for the N sorted boxes.
 // Work is striped such that each thread does a multiple of 2 rows so each does roughly the same
 // amount of work regardless of where it falls in the triangle.
@@ -189,7 +189,7 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma
     }
 }
 
-// Phase 3
+// Part of kernel 2.
 // Greedy filter that writes selections into a per-block region of a
 // scratch buffer (block_id * N entries) and stores the per-block count.
 template <index_int N>
@@ -313,72 +313,59 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
                                    reinterpret_cast<int32_t*>(counts_buf.data()));
 }
 
-// Single-block compaction: an exclusive prefix scan over block_counts gives
-// each per-block region a base offset in the final output; threads in the
-// single launched block then scatter the per-block selections in parallel.
-// Block_id order is preserved, which matches the CPU op's (batch, class)
-// iteration order, and each block writes its `block_counts[b]` entries in
-// order, so the final output is bit-for-bit identical to the serial walker.
-// Trailing slots are left as the zero fill applied before this kernel runs.
-// TODO: this explaination makes no sense
-template <index_int NumBlocks, index_int NumBoxes>
-__device__ void
-nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output)
-{
-    static_assert(NumBlocks > 0, "num_blocks must be > 0");
-    // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB
-    // per-block budget for any realistic ONNX NMS (nb * nc).
-    static_assert(NumBlocks <= 16384,
-                  "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
-
-    __shared__ int32_t offsets[NumBlocks];
 
-    // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b].
+// Kernel 3.
+// Move batch/class box index entries to the beginning of the output buffer.
+// Runs with 1 block. Swaps indices within `output_indices`.
+// `bc_counts`: Number of selected boxes per batch per class. (read-only)
+// `output_indices`: Output box indices that are initially segemented by non-initialized values between selected
+// indices between each batch/class. After this kernel, the selected indicies will be compacted to the beginning
+// of the tensor.
+// `output_num_selected`: Total number of selected boxes.
+template <index_int NumBatchClass, index_int NumBoxes, class Counts, class IdxOutput, class NumOutput>
+__device__ void nonmaxsuppression_compact(const Counts bc_counts,
+                                          NumOutput output_num_selected,
+                                          IdxOutput output_indices)
+{
+    static_assert(NumBatchClass > 0, "NumBatchClass must be > 0");
+    static_assert(NumBatchClass <= 16000, "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
+    __shared__ array<index_int, NumBatchClass> offsets;
+    // Exclusive prefix sum on bc_counts to get offsets
     block_scan(
         idx,
         op::sum{},
-        int32_t{0},
-        index_int{NumBlocks},
-        [&](auto b) -> int32_t { return block_counts[b]; },
-        [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; });
+        0,
+        NumBlocks,
+        [&](auto i) -> int32_t { return bc_counts[i]; },
+        [&](auto i, auto inclusive_value) { offsets[i] = inclusive_value - block_counts[i]; });
     __syncthreads();
 
-    // Parallel scatter: flatten (b, i) so all threads see roughly equal work,
-    // regardless of how `block_counts[b]` is distributed across blocks.
-    constexpr index_int total = NumBlocks * NumBoxes;
-    idx.local_stride(total, [&](auto bi) {
-        const index_int b = bi / NumBoxes;
-        const index_int i = bi % NumBoxes;
-        if(i < static_cast<index_int>(block_counts[b]))
+    // Get num_selected_boxes from last value of exclusive scan and add last bc_counts value.
+    if(idx.local == 0)
+    {
+        output_num_selected[0] = offsets[NumBatchClass-1] + block_counts[NumBlocks-1];
+    }
+
+    // swap index values to make the output packed
+    constexpr index_int index_size = 3;
+    constexpr index_int max_entries = NumBatchClass * NumBoxes;
+    idx.local_stride(max_entries, [&](auto i) {
+        const index_int batch_class_idx = i / NumBoxes;
+        const index_int box_idx = i & NumBoxes;
+        if(box_idx < block_counts[batch_class_idx])
         {
-            const int64_t* src = raw_output + (b * NumBoxes + i) * 3;
-            int64_t* dst       = output + (offsets[b] + i) * 3;
-            dst[0]             = src[0];
-            dst[1]             = src[1];
-            dst[2]             = src[2];
+            auto src = [&](auto j){return output_indices[batch_class_idx * NumBoxes + box_idx * index_size + j]};
+            auto dst = [&](auto j){return output_indices[(offsets[batch_class_idx] + box_idx) * index_size + j]};
+            array<int64_t, 3> tmp_src = {src(0), src(1), src(2)};
+            for(int k = 0; k < 3; ++k)
+            {
+                src(k) = dst(k);
+                dst(k) = tmp_src[k];
+            }
         }
     });
 }
 
-// Compact wrapper invoked from the final JIT kernel. Reads the per-block
-// counts and raw_output produced by `nonmaxsuppression_filter` and copies
-// selections into the final output in block_id (i.e. (batch, class)) order.
-// `output` is last to match the JIT convention of using `inputs.back()` as
-// the kernel's logical output buffer.
-template <index_int NumBlocks, index_int NumBoxes, class Counts, class RawOutput, class Output>
-__device__ void nonmaxsuppression_compact(Counts counts_buf,
-                                          RawOutput raw_output_buf,
-                                          Output output_indices)
-{
-    static_assert(NumBlocks > 0, "num_blocks must be > 0");
-
-    auto idx = make_index();
-    nms_compact<NumBlocks, NumBoxes>(idx,
-                                     reinterpret_cast<const int64_t*>(raw_output_buf.data()),
-                                     reinterpret_cast<const int32_t*>(counts_buf.data()),
-                                     output_indices.data());
-}
-
 } // namespace migraphx
 
 #endif // MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP

From f2734dcbbd155ca15a9a467b930214f4a4898537 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 14 May 2026 18:12:05 -0500
Subject: [PATCH 07/32] minor progress

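---
Notes:
    Review summary (text after the three-dash line is not part of the commit):

    What this patch changes in kernels/nonmaxsuppression.hpp:
    - section comments are renamed from "Kernel N" / "Part of kernel N" to
      "Phase N";
    - the sentinel comment in nonmaxsuppression_sort is shortened;
    - nms_make_iou_mask becomes a template over SortedData/Mask instead of
      taking `const nms_data*` / `uint8_t*`, and `iou_thr` is renamed to
      `iou_threshold`;
    - nonmaxsuppression_filter drops the static_cast<int> on batch_idx and
      class_idx, leaving an implicit index_int -> int conversion.

    Open issues this patch leaves in nonmaxsuppression_compact:
    - `idx`, `NumBlocks` and `block_counts` are used but never declared there
      (the parameters are `NumBatchClass` and `bc_counts`);
    - `i & NumBoxes` replaced the earlier `bi % NumBoxes`;
    - the `src` offset is `batch_class_idx * NumBoxes + box_idx * index_size`,
      where the earlier code used `(b * NumBoxes + i) * 3`;
    - the `src`/`dst` lambdas lack a `;` after the return expression and
      return by value, yet are assigned to in the swap loop;
    - the parameter order (bc_counts, output_num_selected, output_indices)
      differs from the kernel call (bc_counts, output_indices,
      output_num_selected).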
 .../migraphx/kernels/nonmaxsuppression.hpp    | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 3ac8520fc53..94bd32dcd5e 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -105,7 +105,7 @@ struct nms_score_greater
     }
 };
 
-// Kernel 1.
+// Phase 1
 // One block per (batch_idx, class_idx).
 // Load data into per-block buffer of nms_data.
 // Pads values after N with sentinel values.
@@ -148,8 +148,7 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
         }
         else
         {
-            // Sentinel: -inf score so it never beats any real entry, and a
-            // negative box_index so accidental dereferencing is detectable.
+            // Sentinel: -inf score so it never beats any real entry
             d.score     = -__FLT_MAX__;
             d.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
             d.box_index = -1;
@@ -160,12 +159,14 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
     bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_out_tv);
 }
 
-// Part of kernel 2.
+// Phase 2
 // Build the packed upper-triangular IoU mask for the N sorted boxes.
 // Work is striped such that each thread does a multiple of 2 rows so each does roughly the same
 // amount of work regardless of where it falls in the triangle.
-template <index_int N>
-__device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* mask, float iou_thr)
+// `sorted`: sorted nms_data{} tensor
+// `mask`: bool mask tensor
+template <index_int N, class SortedData, class Mask>
+__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, float iou_threshold)
 {
     constexpr index_int half = N / 2;
 
@@ -173,7 +174,7 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma
         for(index_int j = i + 1; j < N; ++j)
         {
             mask[nms_packed_idx(i, j, N)] =
-                nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_thr) ? 1 : 0;
+                nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_threshold) ? 1 : 0;
         }
     };
 
@@ -189,7 +190,7 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma
     }
 }
 
-// Part of kernel 2.
+// Phase 2
 // Greedy filter that writes selections into a per-block region of a
 // scratch buffer (block_id * N entries) and stores the per-block count.
 template <index_int N>
@@ -282,8 +283,8 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
 
     auto idx                            = make_index();
     const index_int block_id            = idx.group;
-    const int batch_idx                 = static_cast<int>(block_id / NumClasses);
-    const int class_idx                 = static_cast<int>(block_id % NumClasses);
+    const int batch_idx                 = block_id / NumClasses;
+    const int class_idx                 = block_id % NumClasses;
     constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1;
 
     nms_data* my_sorted =
@@ -314,7 +315,7 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
 }
 
 
-// Kernel 3.
+// Phase 3
 // Move batch/class box index entries to the beginning of the output buffer.
 // Runs with 1 block. Swaps indices within `output_indices`.
 // `bc_counts`: Number of selected boxes per batch per class. (read-only)

From 637937738f8709be958d1343d8dc5f9ec55cab2b Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 14 May 2026 18:22:23 -0500
Subject: [PATCH 08/32] AI refactor to separate instructions

---
 src/targets/gpu/CMakeLists.txt                |   1 +
 .../gpu/prepare_nonmaxsuppression.hpp         |  48 ++++
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 265 ++++++------------
 .../migraphx/kernels/nonmaxsuppression.hpp    |  37 ++-
 src/targets/gpu/lowering.cpp                  |  29 --
 src/targets/gpu/prepare_nonmaxsuppression.cpp | 216 ++++++++++++++
 src/targets/gpu/target.cpp                    |   3 +
 7 files changed, 371 insertions(+), 228 deletions(-)
 create mode 100644 src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp
 create mode 100644 src/targets/gpu/prepare_nonmaxsuppression.cpp

diff --git a/src/targets/gpu/CMakeLists.txt b/src/targets/gpu/CMakeLists.txt
index 6d66ccdc573..b8e92310b99 100644
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -183,6 +183,7 @@ add_library(migraphx_gpu
     pack_args.cpp
     prefuse_ops.cpp
     prepare_mlir.cpp
+    prepare_nonmaxsuppression.cpp
     prepare_reduce.cpp
     perfdb.cpp
     pooling.cpp
diff --git a/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp b/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp
new file mode 100644
index 00000000000..bf47c8607b9
--- /dev/null
+++ b/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp
@@ -0,0 +1,48 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+#ifndef MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP
+#define MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP
+
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/export.h>
+#include <string>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module;
+
+namespace gpu {
+
+struct MIGRAPHX_GPU_EXPORT prepare_nonmaxsuppression
+{
+    std::string name() const { return "gpu::prepare_nonmaxsuppression"; }
+    void apply(module& m) const;
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index dc0202a4109..be32bf75479 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -27,9 +27,6 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/literal.hpp>
-#include <migraphx/make_op.hpp>
-#include <migraphx/module.hpp>
 
 #include <algorithm>
 #include <cstdint>
@@ -38,17 +35,12 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-// nms_data is laid out as { float score; float box[4]; int box_index; } for a
-// total of 24 bytes per entry. The scratch workspace is allocated as raw int8
-// and reinterpreted in the kernel.
-static constexpr std::size_t nms_bytes_per_data = 24;
-
-// Phase 1 ("sort") kernel: each block normalizes its (batch, class)'s boxes
+// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes
 // and bitonic-sorts them by descending score into a per-block region of the
 // `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so
 // the sort has enough parallelism even when NumBoxes is small relative to it.
 // NOLINTNEXTLINE
-static const char* const nms_load_sort_kernel_src = R"__migraphx__(
+static const char* const nms_sort_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
 #include <args.hpp>
 
@@ -76,7 +68,9 @@ MIGRAPHX_GLOBAL void nms_sort_kernel(${params})
 // records out of the shared `sorted` buffer, builds the IoU mask, runs the
 // greedy filter, and writes selections into a per-block region of the
 // `raw_output` scratch plus a per-block count. No global atomic counter is
-// used, so per-block contents are deterministic.
+// used, so per-block contents are deterministic. The argument order after the
+// `mask` scratch reflects the precompile_op tuple output flatten order:
+// (raw_output, bc_counts).
 // NOLINTNEXTLINE
 static const char* const nms_filter_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
@@ -93,13 +87,13 @@ MIGRAPHX_GLOBAL void nms_filter_kernel(${params})
                                 auto iou_p,
                                 auto thr_p,
                                 auto mask,
-                                auto counts,
-                                auto raw_out) {
+                                auto raw_out,
+                                auto counts) {
         nonmaxsuppression_filter<${num_batches},
                                  ${num_classes},
                                  ${num_boxes},
                                  ${aligned_num_boxes}>(
-            sorted, max_p, iou_p, thr_p, mask, counts, raw_out);
+            sorted, max_p, iou_p, thr_p, mask, raw_out, counts);
     });
 }
 
@@ -125,8 +119,12 @@ extern "C" {
 
 MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 {
-    make_tensors()(${args})([](auto bc_counts, auto output_indices, auto output_num_selected) {
-        nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(bc_counts, output_indices, output_num_selected);
+    make_tensors()(${args})([](auto bc_counts,
+                               auto raw_output,
+                               auto output_indices,
+                               auto output_num_selected) {
+        nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(
+            bc_counts, raw_output, output_indices, output_num_selected);
     });
 }
 
@@ -135,60 +133,65 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 } // namespace migraphx
 )__migraphx__";
 
-// TODO: use compute_block_size and/or compute_global_for?
-// TODO: Don't need num_batches, num_classes, num_boxes as template parameters since tensor_view has shapes.
-struct nms_compiler : compiler<nms_compiler>
+// Compiler for the per-(batch, class) sort kernel. `inputs` is the
+// precompile_op input list:  [boxes, scores, sorted_alloc].
+struct nms_sort_compiler : compiler<nms_sort_compiler>
 {
-    std::vector<std::string> names() const { return {"nonmaxsuppression"}; }
+    std::vector<std::string> names() const { return {"gpu::nms_sort"}; }
 
-    // Compile the sort kernel.
-    // inputs: [boxes, scores, sorted]
-    operation
-    compile_load_sort(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         const auto& boxes_s  = inputs[0];
         const auto& scores_s = inputs[1];
-        const auto num_batches        = boxes_s.lens()[0];
-        const auto num_boxes          = boxes_s.lens()[1];
-        const auto num_classes        = scores_s.lens()[1];
-        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
-        // clamp between 64 and 1024 threads based on aligned_num_boxes
-        const auto block_size = std::min<std::size_t>(std::max<std::size_t>(aligned_b, std::size_t{64}), std::size_t{1024});
+        const auto nb        = boxes_s.lens()[0];
+        const auto b         = boxes_s.lens()[1];
+        const auto nc        = scores_s.lens()[1];
+        const auto aligned_b =
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        // Clamp the block size to [64, 1024] threads, sized for the bitonic sort.
+        const auto block_size = std::min<std::size_t>(
+            std::max<std::size_t>(aligned_b, std::size_t{64}), std::size_t{1024});
 
         hip_compile_options options;
         options.inputs         = inputs;
         options.output         = inputs.back();
         options.kernel_name    = "nms_sort_kernel";
         options.virtual_inputs = inputs;
-        options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
+        options.set_launch_params(v, block_size * nb * nc, block_size);
 
         auto src = interpolate_string(
             nms_sort_kernel_src,
             {{"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
-             {"num_batches", std::to_string(num_batches)},
-             {"num_classes", std::to_string(num_classes)},
-             {"num_boxes", std::to_string(num_boxes)},
+             {"num_batches", std::to_string(nb)},
+             {"num_classes", std::to_string(nc)},
+             {"num_boxes", std::to_string(b)},
              {"aligned_num_boxes", std::to_string(aligned_b)},
-             {"center_point_box",
-              v.at("center_point_box").to<bool>() ? "true" : "false"}});
+             {"center_point_box", v.at("center_point_box").to<bool>() ? "true" : "false"}});
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // inputs: [sorted, max, iou, score_thr, mask, counts, raw_output]
-    // `raw_output` is the last input so the framework treats it as the(
-    // kernel's chained output flowing into the compact kernel. The filter's
-    // inner loops are O(N) per (batch, class), so the launch is sized to
-    // NumBoxes (not AlignedNumBoxes) to avoid leaving padding-only threads
-    // idle. nb, nc, b are passed through the augmented value because the
-    // filter's inputs no longer carry the raw boxes / scores shapes.
-    operation
-    compile_filter(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
+    }
+};
+
+// Compiler for the filter kernel. `inputs` is the precompile_op input list:
+//   [sorted, max, iou, thr, mask, tuple_alloc]
+// where `tuple_alloc` is a tuple allocation holding (raw_output, bc_counts).
+// After flattening the tuple, the kernel sees 7 arguments.
+struct nms_filter_compiler : compiler<nms_filter_compiler>
+{
+    std::vector<std::string> names() const { return {"gpu::nms_filter"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto nb        = v.at("num_batches").to<std::size_t>();
-        const auto nc        = v.at("num_classes").to<std::size_t>();
-        const auto b         = v.at("num_boxes").to<std::size_t>();
-        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        const auto nb = v.at("num_batches").to<std::size_t>();
+        const auto nc = v.at("num_classes").to<std::size_t>();
+        const auto b  = v.at("num_boxes").to<std::size_t>();
+        const auto aligned_b =
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
 
         // Clamp the per-block thread count to [64, 256]: a multiple of the
         // wavefront size keeps __syncthreads / block_scan well-defined, and
@@ -202,16 +205,16 @@ struct nms_compiler : compiler<nms_compiler>
             std::size_t{256});
 
         hip_compile_options options;
-        options.inputs         = inputs;
+        options.inputs         = flatten(inputs);
         options.output         = inputs.back();
         options.kernel_name    = "nms_filter_kernel";
-        options.virtual_inputs = inputs;
+        options.virtual_inputs = options.inputs;
         options.set_launch_params(v, block_size * nb * nc, block_size);
 
         auto src = interpolate_string(
             nms_filter_kernel_src,
-            {{"params", enum_params(inputs.size(), "void * private_p")},
-             {"args", enum_params(inputs.size(), "private_p")},
+            {{"params", enum_params(options.inputs.size(), "void * private_p")},
+             {"args", enum_params(options.inputs.size(), "private_p")},
              {"num_batches", std::to_string(nb)},
              {"num_classes", std::to_string(nc)},
              {"num_boxes", std::to_string(b)},
@@ -219,13 +222,23 @@ struct nms_compiler : compiler<nms_compiler>
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // TODO: REDO this whole thing. It doesn't make sense.
-    // Compiles the nms_compact_kernel.
-    operation
-    compile_compact(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
+    }
+};
+
+// Compiler for the compact kernel. `inputs` is the precompile_op input list:
+//   [bc_counts, raw_output, tuple_alloc]
+// where `tuple_alloc` is a tuple allocation holding (selected_indices,
+// num_selected). After flattening, the kernel sees 4 arguments. `num_blocks`
+// (a.k.a. nb*nc) and `num_boxes` are recovered from the input shapes.
+struct nms_compact_compiler : compiler<nms_compact_compiler>
+{
+    std::vector<std::string> names() const { return {"gpu::nms_compact"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        // Derive num_blocks (length of counts) and per-block stride NumBoxes
-        // (raw_output is sized nb*nc*NumBoxes*3 int64 entries).
         const auto& cnt_s     = inputs[0];
         const auto& raw_s     = inputs[1];
         const auto num_blocks = cnt_s.elements();
@@ -233,7 +246,7 @@ struct nms_compiler : compiler<nms_compiler>
                                     ? raw_s.elements() / (num_blocks * std::size_t{3})
                                     : std::size_t{0};
 
-        const auto total = std::max(num_blocks * num_boxes, std::size_t{1});
+        const auto total      = std::max(num_blocks * num_boxes, std::size_t{1});
         const auto block_size = std::min<std::size_t>(
             std::max<std::size_t>(
                 static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(total))),
@@ -241,142 +254,24 @@ struct nms_compiler : compiler<nms_compiler>
             std::size_t{256});
 
         hip_compile_options options;
-        options.inputs         = inputs;
+        options.inputs         = flatten(inputs);
         options.output         = inputs.back();
         options.kernel_name    = "nms_compact_kernel";
-        options.virtual_inputs = inputs;
-        options.set_launch_params(v, 1, block_size);
+        options.virtual_inputs = options.inputs;
+        options.set_launch_params(v, block_size, block_size);
 
         auto src = interpolate_string(
             nms_compact_kernel_src,
-            {{"params", enum_params(inputs.size(), "void * private_p")},
-             {"args", enum_params(inputs.size(), "private_p")},
-             {"num_batch_class", std::to_string(num_batch_class)},
+            {{"params", enum_params(options.inputs.size(), "void * private_p")},
+             {"args", enum_params(options.inputs.size(), "private_p")},
+             {"num_batch_class", std::to_string(num_blocks)},
              {"num_boxes", std::to_string(num_boxes)}});
         return compile_hip_code_object(ctx, src, options);
     }
 
-    // Required compiler<> hook, should not be used for this compiler.
-    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
-    {
-        return {};
-    }
-
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
-        // ins->inputs() is [user_inputs..., output_alloc] from
-        // insert_precompile_op. user_inputs has 2..5 entries per ONNX NMS.
-        auto raw = ins->inputs();
-        if(raw.size() < 3 or raw.size() > 6)
-            MIGRAPHX_THROW("nms_compiler: unexpected input count " + std::to_string(raw.size()));
-
-        std::vector<shape> raw_shapes;
-        raw_shapes.reserve(raw.size() - 1);
-        std::transform(raw.begin(),
-                       raw.end() - 1,
-                       std::back_inserter(raw_shapes),
-                       [](auto i) { return i->get_shape(); });
-
-        // Default shapes for missing optional scalar inputs. The literals
-        // inserted by the replace lambda use these same shapes so the
-        // compiled kernel's tensor_view types match the runtime arguments.
-        const shape default_max_s{shape::int64_type, {1}};
-        const shape default_iou_s{shape::float_type, {1}};
-        const shape default_thr_s{shape::float_type, {1}};
-        if(raw_shapes.size() < 3)
-            raw_shapes.push_back(default_max_s);
-        if(raw_shapes.size() < 4)
-            raw_shapes.push_back(default_iou_s);
-        if(raw_shapes.size() < 5)
-            raw_shapes.push_back(default_thr_s);
-
-        const auto& boxes_s  = raw_shapes[0];
-        const auto& scores_s = raw_shapes[1];
-        if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
-            MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D");
-
-        const auto nb         = boxes_s.lens()[0];
-        const auto b          = boxes_s.lens()[1];
-        const auto nc         = scores_s.lens()[1];
-        const auto aligned_b  = static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-        const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1};
-
-        shape sorted_shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
-        shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}};
-        // Per-block output: nb*nc blocks, each can write up to b
-        // selections of (batch, class, box_idx) int64 triples.
-        shape output_s{shape::int64_type, {nb * nc * b * 3}};
-        // Per-batch-per-class selection counts (one index_int per (batch, class) block).
-        shape bc_counts_shape{shape::int32_type, {nb * nc}};
-
-        // Sort kernel input shapes:   [boxes, scores, sorted]
-        std::vector<shape> sort_shapes = {boxes_s, scores_s, sorted_shape};
-
-        // Filter kernel input shapes: [sorted, max, iou, thr, mask, counts, raw_out]
-        std::vector<shape> filter_shapes = {sorted_shape,
-                                            raw_shapes[2],
-                                            raw_shapes[3],
-                                            raw_shapes[4],
-                                            mask_shape,
-                                            bc_counts_shape,
-                                            raw_output_s};
-
-        std::vector<shape> compact_shapes = {bc_counts_shape, output_s, {shape::int64_type, {1}}};
-
-        // The filter kernel can't recover nb/nc/b from its input shapes
-        // (sorted/mask/counts/raw_out are all flat scratch buffers), so we
-        // pass them through an augmented value alongside the op attributes.
-        value augmented        = op.to_value();
-        augmented["num_batches"] = nb;
-        augmented["num_classes"] = nc;
-        augmented["num_boxes"]   = b;
-
-        auto sort_kop    = compile_sort(ctx, sort_shapes, augmented);
-        auto filter_kop  = compile_filter(ctx, filter_shapes, augmented);
-        auto compact_kop = compile_compact(ctx, compact_shapes, augmented);
-
-        // kernel operations
-        std::vector<operation> kops = {sort_kop, filter_kop, compact_kop};
-
-        return {kops,
-                [=](module& m, instruction_ref rep_ins, const std::vector<operation>& ops) {
-                    auto args = rep_ins->inputs();
-                    auto output  = args.back();
-                    args.pop_back();
-                    
-                    // fill out optional arguments
-                    if(args.size() < 3)
-                    {
-                        args.push_back(m.insert_literal(
-                            rep_ins, literal{default_max_s, {std::int64_t{0}}}));
-                    }
-                    if(args.size() < 4)
-                    {
-                        args.push_back(
-                            m.insert_literal(rep_ins, literal{default_iou_s, {0.0f}}));
-                    }
-                    if(args.size() < 5)
-                    {
-                        args.push_back(
-                            m.insert_literal(rep_ins, literal{default_thr_s, {0.0f}}));
-                    }
-
-                    auto sorted = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(sorted_shape)}}));
-                    auto mask = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(mask_shape)}}));
-                    auto bc_counts = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(bc_counts_shape)}}));
-                    auto output_num_selected = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(scalar_shape)}}));
-
-                    auto load_sort_ins = m.insert_instruction(rep_ins, ops[0], {args[0], args[1], sorted});
-
-                    auto filter_ins = m.insert_instruction(
-                        rep_ins,
-                        ops[1],
-                        {load_sort_ins, args[2], args[3], args[4], mask, bc_counts, output});
-
-                    output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 0}}), filter_ins); 
-                    auto bc_counts_output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 1}}), filter_ins);
-                    m.replace_instruction(rep_ins, ops[2], {bc_counts_output, output, output_num_selected});
-                }};
+        return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
     }
 };
 
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 94bd32dcd5e..22eebbb1bb1 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -256,9 +256,10 @@ __device__ void nms_filter_per_block(index idx,
 // Expecting box-coordinate convention has already been normalized into corner form
 // in `sorted_buf`.
 //
-// `raw_output_buf` is intentionally the last parameter so that JIT-compiled
-// callers (which use `inputs.back()` as the kernel's output buffer) treat it
-// as the chained output flowing into the compact kernel.
+// The parameter order matches the flatten order of the precompile_op tuple
+// output (raw_output, counts). `sorted_buf` and `mask_buf` are scratch inputs
+// allocated upstream; `raw_output_buf` and `counts_buf` are the two halves of
+// the tuple-typed output buffer.
 template <index_int NumBatches,
           index_int NumClasses,
           index_int NumBoxes,
@@ -268,15 +269,15 @@ template <index_int NumBatches,
           class IouThr,
           class ScoreThr,
           class Mask,
-          class Counts,
-          class RawOutput>
+          class RawOutput,
+          class Counts>
 __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
                                          MaxOut max_out_p,
                                          IouThr iou_thr_p,
                                          ScoreThr score_thr_p,
                                          Mask mask_buf,
-                                         Counts counts_buf,
-                                         RawOutput raw_output_buf)
+                                         RawOutput raw_output_buf,
+                                         Counts counts_buf)
 {
     static_assert(NumBatches > 0, "num_batches must be > 0");
     static_assert(NumClasses > 0, "num_classes must be > 0");
@@ -317,16 +318,24 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
 
 // Phase 3
 // Move batch/class box index entries to the beginning of the output buffer.
-// Runs with 1 block. Swaps indices within `output_indices`.
+// Runs with 1 block. Reads from `raw_indices` (the filter kernel's per-block
+// output) and writes the compacted selections into `output_indices`.
 // `bc_counts`: Number of selected boxes per batch per class. (read-only)
-// `output_indices`: Output box indices that are initially segemented by non-initialized values between selected
-// indices between each batch/class. After this kernel, the selected indicies will be compacted to the beginning
-// of the tensor.
+// `raw_indices`: Per-block raw indices written by the filter kernel
+// (read-only).
+// `output_indices`: Output box indices, packed contiguously at the beginning
+// of the buffer in (batch, class) iteration order.
 // `output_num_selected`: Total number of selected boxes.
-template <index_int NumBatchClass, index_int NumBoxes, class Counts, class IdxOutput, class NumOutput>
+template <index_int NumBatchClass,
+          index_int NumBoxes,
+          class Counts,
+          class RawIndices,
+          class IdxOutput,
+          class NumOutput>
 __device__ void nonmaxsuppression_compact(const Counts bc_counts,
-                                          NumOutput output_num_selected,
-                                          IdxOutput output_indices)
+                                          RawIndices raw_indices,
+                                          IdxOutput output_indices,
+                                          NumOutput output_num_selected)
 {
     static_assert(NumBatchClass > 0, "NumBatchClass must be > 0");
     static_assert(NumBatchClass <= 16000, "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 56c6039c075..1a9275de52b 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -108,7 +108,6 @@ struct miopen_apply
         add_if_op();
         add_loop_op();
         add_neg_op();
-        //add_nms_op();
         add_lrn_op();
         add_convolution_backwards_op();
         add_select_module_op();
@@ -447,34 +446,6 @@ struct miopen_apply
         });
     }
 
-    void add_nms_op()
-    {
-        apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
-            // Fixed-output NMS is handled by the JIT kernel registered via
-            // jit/nonmaxsuppression.cpp; route it through insert_precompile_op
-            // so compile_ops picks it up later. The dynamic-output mode still
-            // falls back to the CPU implementation.
-            auto op_val = ins->get_operator().to_value();
-            if(not op_val.at("use_dyn_output").to<bool>())
-                return insert_precompile_op(ins);
-
-            auto s      = ins->get_shape();
-            auto output = insert_allocation(ins, s);
-            std::vector<instruction_ref> cpu_inputs;
-            auto inputs = ins->inputs();
-            std::transform(
-                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
-                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
-                });
-            cpu_inputs.front() =
-                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
-            auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
-            auto gpu_out =
-                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
-            return mod->replace_instruction(ins, gpu_out);
-        });
-    }
-
     void add_lrn_op()
     {
         apply_map.emplace("lrn", [=](instruction_ref ins) {
diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp
new file mode 100644
index 00000000000..8f9219428a6
--- /dev/null
+++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp
@@ -0,0 +1,216 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+#include <migraphx/gpu/prepare_nonmaxsuppression.hpp>
+#include <migraphx/bit.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/register_op.hpp>
+
+#include <algorithm>
+#include <cstdint>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// nms_data is laid out as { float score; float box[4]; int box_index; } for a
+// total of 24 bytes per entry. The scratch workspace is allocated as raw int8
+// and reinterpreted in the kernel.
+static constexpr std::size_t nms_bytes_per_data = 24;
+
+// Phase-1 op: sort boxes per (batch, class) into a flat byte scratch buffer.
+struct nms_sort
+{
+    bool center_point_box = false;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.center_point_box, "center_point_box"));
+    }
+
+    std::string name() const { return "gpu::nms_sort"; }
+
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, *this}.has(2);
+        const auto& boxes_s  = inputs.at(0);
+        const auto& scores_s = inputs.at(1);
+        if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
+            MIGRAPHX_THROW("gpu::nms_sort: boxes and scores must be 3-D");
+        const auto nb = boxes_s.lens()[0];
+        const auto b  = boxes_s.lens()[1];
+        const auto nc = scores_s.lens()[1];
+        const auto aligned_b =
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
+        return shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
+    }
+};
+MIGRAPHX_REGISTER_OP(nms_sort);
+
+// Phase-2 op: build the IoU mask and run the greedy filter. Produces a tuple
+// of (raw_output, bc_counts). num_batches/num_classes/num_boxes are kept as
+// op attributes because the filter inputs are flat scratch buffers from which
+// these can't be recovered.
+struct nms_filter
+{
+    std::size_t num_batches = 0;
+    std::size_t num_classes = 0;
+    std::size_t num_boxes   = 0;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.num_batches, "num_batches"),
+                    f(self.num_classes, "num_classes"),
+                    f(self.num_boxes, "num_boxes"));
+    }
+
+    std::string name() const { return "gpu::nms_filter"; }
+
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, *this}.has(5);
+        shape raw_output_shape{shape::int64_type, {num_batches * num_classes * num_boxes * 3}};
+        shape bc_counts_shape{shape::int32_type, {num_batches * num_classes}};
+        return shape{{raw_output_shape, bc_counts_shape}};
+    }
+};
+MIGRAPHX_REGISTER_OP(nms_filter);
+
+// Phase-3 op: prefix-scan the per-block counts and compact the selections into
+// the final (selected_indices, num_selected) tuple.
+struct nms_compact
+{
+    template <class Self, class F>
+    static auto reflect(Self&, F)
+    {
+        return pack();
+    }
+
+    std::string name() const { return "gpu::nms_compact"; }
+
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, *this}.has(2);
+        const auto& raw_out_s    = inputs.at(1);
+        const auto max_num_boxes = raw_out_s.elements() / std::size_t{3};
+        shape selected_indices_shape{shape::int64_type, {max_num_boxes, 3}};
+        shape num_selected_shape{shape::int64_type, {1}};
+        return shape{{selected_indices_shape, num_selected_shape}};
+    }
+};
+MIGRAPHX_REGISTER_OP(nms_compact);
+
+namespace {
+
+std::vector<instruction_ref> find_nms(module& m)
+{
+    std::vector<instruction_ref> result;
+    auto im = iterator_for(m);
+    std::copy_if(im.begin(), im.end(), std::back_inserter(result), [](auto ins) {
+        return ins->name() == "nonmaxsuppression";
+    });
+    return result;
+}
+
+void rewrite_nms(module& m, instruction_ref ins)
+{
+    auto inputs = ins->inputs();
+    if(inputs.size() < 2 or inputs.size() > 5)
+        MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " +
+                       std::to_string(inputs.size()));
+
+    const auto& boxes_s  = inputs[0]->get_shape();
+    const auto& scores_s = inputs[1]->get_shape();
+    if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
+        MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D");
+
+    const auto nb         = boxes_s.lens()[0];
+    const auto b          = boxes_s.lens()[1];
+    const auto nc         = scores_s.lens()[1];
+    const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1};
+
+    // Fill in missing optional scalar inputs with default literals. The kernels
+    // load these via tensor_view<T, {1}>, so single-element shapes are needed.
+    const shape default_max_s{shape::int64_type, {1}};
+    const shape default_iou_s{shape::float_type, {1}};
+    const shape default_thr_s{shape::float_type, {1}};
+    if(inputs.size() < 3)
+        inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
+    if(inputs.size() < 4)
+        inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}}));
+    if(inputs.size() < 5)
+        inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}}));
+
+    auto op_val = ins->get_operator().to_value();
+    bool center_point_box = op_val.at("center_point_box").to<bool>();
+
+    // Mask is scratch only; allocate up-front so the standard replace_allocate
+    // pass can later turn it into hip::allocate.
+    shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}};
+    auto mask_alloc =
+        m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}}));
+
+    auto sorted = m.insert_instruction(
+        ins, make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), inputs[0], inputs[1]);
+
+    auto filter = m.insert_instruction(
+        ins,
+        make_op("gpu::nms_filter",
+                {{"num_batches", nb}, {"num_classes", nc}, {"num_boxes", b}}),
+        sorted,
+        inputs[2],
+        inputs[3],
+        inputs[4],
+        mask_alloc);
+
+    auto raw_output =
+        m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
+    auto bc_counts =
+        m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
+
+    auto compact = m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
+
+    m.replace_instruction(ins, compact);
+}
+
+} // namespace
+
+void prepare_nonmaxsuppression::apply(module& m) const
+{
+    for(auto ins : find_nms(m))
+    {
+        rewrite_nms(m, ins);
+    }
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp
index 3ed3e72033d..8ff00a75b7b 100644
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -73,6 +73,7 @@
 #include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
+#include <migraphx/gpu/prepare_nonmaxsuppression.hpp>
 #include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/schedule_model.hpp>
 #include <migraphx/gpu/sync_device.hpp>
@@ -163,6 +164,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         auto_contiguous{},
         dead_code_elimination{},
+        prepare_nonmaxsuppression{},
+        dead_code_elimination{},
         lowering{&ctx, options.offload_copy},
         eliminate_contiguous{"gpu::contiguous"},
         dead_code_elimination{},

From 2ac67b0ee552483c344c4a7a5e80ad836a81e374 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Fri, 15 May 2026 18:20:03 -0500
Subject: [PATCH 09/32] Progress on cleanup, now segmentation fault in kernel

---
 src/include/migraphx/op/nonmaxsuppression.hpp |   6 +-
 src/targets/gpu/compile_hip_code_object.cpp   |   3 +
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 127 ++++-------
 .../migraphx/kernels/nonmaxsuppression.hpp    | 206 ++++++++----------
 src/targets/gpu/prepare_nonmaxsuppression.cpp | 181 ++++++++-------
 5 files changed, 230 insertions(+), 293 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 6b9af617909..9cf3d41070b 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -294,7 +294,8 @@ struct nonmaxsuppression
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         // make buffer of maximum size
-        shape max_output_shape = {output_shape.type(), output_shape.max_lens()};
+        auto output_shapes = flatten({output_shape});
+        shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()};
         argument result{max_output_shape};
 
         std::size_t max_output_boxes_per_class =
@@ -317,8 +318,7 @@ struct nonmaxsuppression
                                            score_threshold);
             });
         });
-        shape scalar_int_shape = {shape::int64_type, {1}};
-        argument num_selected_result{scalar_int_shape};
+        argument num_selected_result{output_shapes.at(1)};
         num_selected_result.visit([&](auto output){
             output.begin() = num_selected;
         });
diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp
index f44804758d5..efe3b4f80bd 100644
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -192,6 +192,9 @@ compute_global_for(const context& ctx, std::size_t n, std::size_t over)
     };
 }
 
+
+// `n`: The amount of parallel work within a block.
+// `max_block_size`: Upper limit on block size.
 std::size_t compute_block_size(const context& ctx, std::size_t n, std::size_t max_block_size)
 {
     const std::size_t min_block_size = ctx.get_current_device().get_wavefront_size();
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index be32bf75479..be37fcadbc6 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -35,10 +35,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes
-// and bitonic-sorts them by descending score into a per-block region of the
-// `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so
-// the sort has enough parallelism even when NumBoxes is small relative to it.
 // NOLINTNEXTLINE
 static const char* const nms_sort_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
@@ -64,13 +60,6 @@ MIGRAPHX_GLOBAL void nms_sort_kernel(${params})
 } // namespace migraphx
 )__migraphx__";
 
-// Phase-2 ("filter") kernel: each block reads its (batch, class)'s sorted
-// records out of the shared `sorted` buffer, builds the IoU mask, runs the
-// greedy filter, and writes selections into a per-block region of the
-// `raw_output` scratch plus a per-block count. No global atomic counter is
-// used, so per-block contents are deterministic. The argument order after the
-// `mask` scratch reflects the precompile_op tuple output flatten order:
-// (raw_output, bc_counts).
 // NOLINTNEXTLINE
 static const char* const nms_filter_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
@@ -87,13 +76,13 @@ MIGRAPHX_GLOBAL void nms_filter_kernel(${params})
                                 auto iou_p,
                                 auto thr_p,
                                 auto mask,
-                                auto raw_out,
+                                auto output,
                                 auto counts) {
         nonmaxsuppression_filter<${num_batches},
                                  ${num_classes},
                                  ${num_boxes},
                                  ${aligned_num_boxes}>(
-            sorted, max_p, iou_p, thr_p, mask, raw_out, counts);
+            sorted, max_p, iou_p, thr_p, mask, output, counts);
     });
 }
 
@@ -102,12 +91,6 @@ MIGRAPHX_GLOBAL void nms_filter_kernel(${params})
 } // namespace migraphx
 )__migraphx__";
 
-// Phase-3 ("compact") kernel: a single block does an exclusive prefix scan
-// over the per-block counts to obtain output offsets, then its threads
-// scatter selections from each per-block region of `raw_output` into the
-// contiguous prefix of the final output. The order of (block_id 0, 1, ...)
-// is the same as the CPU op's (batch, class) iteration order, so the
-// resulting output matches the CPU op exactly.
 // NOLINTNEXTLINE
 static const char* const nms_compact_kernel_src = R"__migraphx__(
 #include <migraphx/kernels/nonmaxsuppression.hpp>
@@ -119,12 +102,12 @@ extern "C" {
 
 MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 {
-    make_tensors()(${args})([](auto bc_counts,
-                               auto raw_output,
-                               auto output_indices,
-                               auto output_num_selected) {
+    make_tensors()(${args})([](const auto bc_counts,
+                               auto indices,
+                               auto num_selected,
+                               auto output) {
         nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(
-            bc_counts, raw_output, output_indices, output_num_selected);
+            bc_counts, indices, num_selected, output);
     });
 }
 
@@ -133,8 +116,7 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 } // namespace migraphx
 )__migraphx__";
 
-// Compiler for the per-(batch, class) sort kernel. `inputs` is the
-// precompile_op input list:  [boxes, scores, sorted_alloc].
+// `inputs` is the precompile_op input list:  [boxes, scores, sorted_alloc].
 struct nms_sort_compiler : compiler<nms_sort_compiler>
 {
     std::vector<std::string> names() const { return {"gpu::nms_sort"}; }
@@ -143,30 +125,29 @@ struct nms_sort_compiler : compiler<nms_sort_compiler>
     {
         const auto& boxes_s  = inputs[0];
         const auto& scores_s = inputs[1];
-        const auto nb        = boxes_s.lens()[0];
-        const auto b         = boxes_s.lens()[1];
-        const auto nc        = scores_s.lens()[1];
-        const auto aligned_b =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-        // Clamp the block size to [64, 1024] threads, sized for the bitonic sort.
-        const auto block_size = std::min<std::size_t>(
-            std::max<std::size_t>(aligned_b, std::size_t{64}), std::size_t{1024});
+        const auto num_batches = boxes_s.lens()[0];
+        const auto num_boxes = boxes_s.lens()[1];
+        const auto num_classes = scores_s.lens()[1];
+        const auto aligned_num_boxes =
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        // NOTE: topK kernel uses relement/4 for amount of work in a block?
+        auto block_size = compute_block_size(ctx, num_boxes, 1024);
 
         hip_compile_options options;
         options.inputs         = inputs;
         options.output         = inputs.back();
         options.kernel_name    = "nms_sort_kernel";
         options.virtual_inputs = inputs;
-        options.set_launch_params(v, block_size * nb * nc, block_size);
+        options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
 
         auto src = interpolate_string(
             nms_sort_kernel_src,
             {{"params", enum_params(inputs.size(), "void * private_p")},
              {"args", enum_params(inputs.size(), "private_p")},
-             {"num_batches", std::to_string(nb)},
-             {"num_classes", std::to_string(nc)},
-             {"num_boxes", std::to_string(b)},
-             {"aligned_num_boxes", std::to_string(aligned_b)},
+             {"num_batches", std::to_string(num_batches)},
+             {"num_classes", std::to_string(num_classes)},
+             {"num_boxes", std::to_string(num_boxes)},
+             {"aligned_num_boxes", std::to_string(aligned_num_boxes)},
              {"center_point_box", v.at("center_point_box").to<bool>() ? "true" : "false"}});
         return compile_hip_code_object(ctx, src, options);
     }
@@ -177,9 +158,8 @@ struct nms_sort_compiler : compiler<nms_sort_compiler>
     }
 };
 
-// Compiler for the filter kernel. `inputs` is the precompile_op input list:
-//   [sorted, max, iou, thr, mask, tuple_alloc]
-// where `tuple_alloc` is a tuple allocation holding (raw_output, bc_counts).
+// `inputs` is the precompile_op input list: [sorted, max, iou, thr, mask, tuple_alloc].
+// Here `tuple_alloc` is a tuple allocation holding (raw_output, bc_counts).
 // After flattening the tuple, the kernel sees 7 arguments.
 struct nms_filter_compiler : compiler<nms_filter_compiler>
 {
@@ -187,38 +167,30 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto nb = v.at("num_batches").to<std::size_t>();
-        const auto nc = v.at("num_classes").to<std::size_t>();
-        const auto b  = v.at("num_boxes").to<std::size_t>();
-        const auto aligned_b =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-
-        // Clamp the per-block thread count to [64, 256]: a multiple of the
-        // wavefront size keeps __syncthreads / block_scan well-defined, and
-        // 256 is the sweet spot for the O(N) inner loops without inflating
-        // shared-memory pressure on `removed[N]` (which is sized by N, not by
-        // block_size).
-        const auto block_size = std::min<std::size_t>(
-            std::max<std::size_t>(
-                static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b))),
-                std::size_t{64}),
-            std::size_t{256});
+        const auto num_batches = v.at("num_batches").to<std::size_t>();
+        const auto num_classes = v.at("num_classes").to<std::size_t>();
+        const auto num_boxes  = v.at("num_boxes").to<std::size_t>();
+        const auto aligned_num_boxes =
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        // TODO: tune for max block size?
+        // num_boxes/2 because of strided thread work distribution
+        const auto block_size = compute_block_size(ctx, num_boxes/2, 256);
 
         hip_compile_options options;
         options.inputs         = flatten(inputs);
         options.output         = inputs.back();
         options.kernel_name    = "nms_filter_kernel";
         options.virtual_inputs = options.inputs;
-        options.set_launch_params(v, block_size * nb * nc, block_size);
+        options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
 
         auto src = interpolate_string(
             nms_filter_kernel_src,
             {{"params", enum_params(options.inputs.size(), "void * private_p")},
              {"args", enum_params(options.inputs.size(), "private_p")},
-             {"num_batches", std::to_string(nb)},
-             {"num_classes", std::to_string(nc)},
-             {"num_boxes", std::to_string(b)},
-             {"aligned_num_boxes", std::to_string(aligned_b)}});
+             {"num_batches", std::to_string(num_batches)},
+             {"num_classes", std::to_string(num_classes)},
+             {"num_boxes", std::to_string(num_boxes)},
+             {"aligned_num_boxes", std::to_string(aligned_num_boxes)}});
         return compile_hip_code_object(ctx, src, options);
     }
 
@@ -228,30 +200,21 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
     }
 };
 
-// Compiler for the compact kernel. `inputs` is the precompile_op input list:
-//   [bc_counts, raw_output, tuple_alloc]
-// where `tuple_alloc` is a tuple allocation holding (selected_indices,
-// num_selected). After flattening, the kernel sees 4 arguments. `num_blocks`
-// (a.k.a. nb*nc) and `num_boxes` are recovered from the input shapes.
+// `inputs` is the precompile_op input list: [bc_counts, raw_output, tuple_alloc]
+// where `tuple_alloc` is a tuple allocation holding (selected_indices, num_selected).
+// After flattening, the kernel sees 4 arguments.
 struct nms_compact_compiler : compiler<nms_compact_compiler>
 {
     std::vector<std::string> names() const { return {"gpu::nms_compact"}; }
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto& cnt_s     = inputs[0];
-        const auto& raw_s     = inputs[1];
-        const auto num_blocks = cnt_s.elements();
-        const auto num_boxes  = (num_blocks > 0)
-                                    ? raw_s.elements() / (num_blocks * std::size_t{3})
-                                    : std::size_t{0};
-
-        const auto total      = std::max(num_blocks * num_boxes, std::size_t{1});
-        const auto block_size = std::min<std::size_t>(
-            std::max<std::size_t>(
-                static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(total))),
-                std::size_t{64}),
-            std::size_t{256});
+        const auto& cnt_s = inputs[0];
+        const auto& indices_s = inputs[1];
+        const auto num_batch_class = cnt_s.elements();
+        const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3});
+        // TODO: tune for max block size?
+        const auto block_size = compute_block_size(ctx, num_boxes, 256);
 
         hip_compile_options options;
         options.inputs         = flatten(inputs);
@@ -264,7 +227,7 @@ struct nms_compact_compiler : compiler<nms_compact_compiler>
             nms_compact_kernel_src,
             {{"params", enum_params(options.inputs.size(), "void * private_p")},
              {"args", enum_params(options.inputs.size(), "private_p")},
-             {"num_batch_class", std::to_string(num_blocks)},
+             {"num_batch_class", std::to_string(num_batch_class)},
              {"num_boxes", std::to_string(num_boxes)}});
         return compile_hip_code_object(ctx, src, options);
     }
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 22eebbb1bb1..c226ab78ab6 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -33,6 +33,7 @@
 #include <migraphx/kernels/sort.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
 #include <migraphx/kernels/types.hpp>
+#include <migraphx/kernels/slice.hpp>
 
 namespace migraphx {
 
@@ -44,7 +45,7 @@ struct nms_data
 };
 
 // Decode a single box into (xmin, ymin, xmax, ymax) corners.
-// Normalize such that [x1, y1] is the bottom left corner
+// Normalize such that [x1, y1] is the bottom left corner.
 template <bool CenterPointBox, class Box>
 __device__ inline array<float, 4> nms_normalize_box(Box box)
 {
@@ -71,9 +72,9 @@ __device__ inline array<float, 4> nms_normalize_box(Box box)
     }
 }
 
-template <class Box>
+template <class Box, class Threshold>
 __device__ inline bool
-nms_iou_over_threshold(const Box a, Box b, float threshold)
+nms_iou_over_threshold(const Box a, Box b, Threshold threshold)
 {
     const float left   = max(a[0], b[0]);
     const float right  = min(a[2], b[2]);
@@ -105,7 +106,6 @@ struct nms_score_greater
     }
 };
 
-// Phase 1
 // One block per (batch_idx, class_idx).
 // Load data into per-block buffer of nms_data.
 // Pads values after N with sentinel values.
@@ -123,8 +123,10 @@ template <bool CenterPointBox,
           class Output>
 __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output out_tv)
 {
-    static_assert(NumBatches > 0, "num_batches must be > 0");
-    static_assert(NumClasses > 0, "num_classes must be > 0");
+    static_assert(NumBatches > 0);
+    static_assert(NumClasses > 0);
+    static_assert(NumBoxes > 0);
+    static_assert(AlignedNumBoxes > 0);
 
     auto idx = make_index();
     const index_int block_id = idx.group;
@@ -138,87 +140,86 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
     const auto* boxes_b   = boxes_tv.data() + batch_idx * NumBoxes * 4;
     const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
 
-    nms_data d;
+    nms_data tmp_data;
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            d.score     = scores_bc[i];
-            d.box       = nms_normalize_box<CenterPointBox>(boxes_b + i * 4);
-            d.box_index = static_cast<int>(i);
+            tmp_data.score     = scores_bc[i];
+            tmp_data.box       = nms_normalize_box<CenterPointBox>(boxes_b + i * 4);
+            tmp_data.box_index = static_cast<int>(i);
         }
         else
         {
             // Sentinel: -inf score so it never beats any real entry
-            d.score     = -__FLT_MAX__;
-            d.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
-            d.box_index = -1;
+            tmp_data.score     = -__FLT_MAX__;
+            tmp_data.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
+            tmp_data.box_index = -1;
         }
-        block_out_tv[i] = d;
+        block_out_tv[i] = tmp_data;
     });
     __syncthreads();
     bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_out_tv);
 }
 
-// Phase 2
-// Build the packed upper-triangular IoU mask for the N sorted boxes.
+// Build the packed upper-triangular IoU mask for the NumBoxes sorted boxes.
 // Work is striped such that each thread does a multiple of 2 rows so each does roughly the same
 // amount of work regardless of where it falls in the triangle.
 // `sorted`: sorted nms_data{} tensor
 // `mask`: bool mask tensor
-template <index_int N, class SortedData, class Mask>
-__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, float iou_threshold)
+template <index_int NumBoxes, class SortedData, class Mask, class IouThreshold>
+__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, IouThreshold iou_threshold)
 {
-    constexpr index_int half = N / 2;
+    static_assert(NumBoxes > 0);
+    constexpr index_int half = NumBoxes / 2;
 
     auto fill_row = [&](index_int i) {
-        for(index_int j = i + 1; j < N; ++j)
+        for(index_int j = i + 1; j < NumBoxes; ++j)
         {
-            mask[nms_packed_idx(i, j, N)] =
+            mask[nms_packed_idx(i, j, NumBoxes)] =
                 nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_threshold) ? 1 : 0;
         }
     };
 
     idx.local_stride(half, [&](auto i) {
         fill_row(i);
-        fill_row(N - 1 - i);
+        fill_row(NumBoxes - 1 - i);
     });
 
-    if constexpr((N & 1) != 0 and N > 1)
+    // Have thread 0 do middle row if odd NumBoxes
+    if constexpr((NumBoxes & 1) != 0 and NumBoxes > 1)
     {
         if(idx.local == 0)
             fill_row(half);
     }
 }
 
-// Phase 2
-// Greedy filter that writes selections into a per-block region of a
-// scratch buffer (block_id * N entries) and stores the per-block count.
-template <index_int N>
+// TODO: use template for types
+// Greedy filter that writes selections into a per-batch per-class region of output.
+template <index_int NumBoxes, index_int NumClasses, class Sorted, class Mask, class Output, class Counts>
 __device__ void nms_filter_per_block(index idx,
-                                     const nms_data* sorted,
-                                     const uint8_t* mask,
-                                     int batch_idx,
-                                     int class_idx,
+                                     const Sorted sorted,
+                                     const Mask mask,
                                      int64_t max_output,
                                      float score_thr,
-                                     int64_t* raw_output,    // [num_blocks * N * 3]
-                                     int32_t* block_counts)  // [num_blocks]
+                                     Output output,
+                                     Counts bc_counts)
 {
-    __shared__ uint8_t removed[N > 0 ? N : 1];
-    // Match the CPU op: only filter by score when score_threshold > 0 (the CPU
-    // takes the same branch). With a non-positive (or sentinel) threshold, all
-    // boxes are kept regardless of sign.
+    static_assert(NumBoxes > 1);
+
+    const index_int block_id = idx.group;
+    const int batch_idx = block_id / NumClasses;
+    const int class_idx = block_id % NumClasses;
+    // TODO: use bits for removed mask
+    __shared__ uint8_t removed[NumBoxes];
+    // Match the ref op: only filter by score when score_threshold > 0.
     const bool do_filter = score_thr > 0.f;
-    idx.local_stride(N, [&](auto i) {
-        removed[i] = (do_filter and sorted[i].score < score_thr) ? 1 : 0;
+    idx.local_stride(NumBoxes, [&](auto i) {
+        removed[i] = (do_filter and sorted[i].score < score_thr);
     });
     __syncthreads();
 
-    const index_int block_id = idx.group;
-    int64_t* my_output       = raw_output + block_id * N * 3;
-
     index_int output_idx = 0;
-    for(index_int i = 0; i < N; ++i)
+    for(index_int i = 0; i < NumBoxes; ++i)
     {
         if(output_idx >= max_output)
         {
@@ -229,37 +230,25 @@ __device__ void nms_filter_per_block(index idx,
         {
             if(idx.local == 0)
             {
-                my_output[output_idx * 3 + 0] = batch_idx;
-                my_output[output_idx * 3 + 1] = class_idx;
-                my_output[output_idx * 3 + 2] = sorted[i].box_index;
+                output[output_idx * 3 + 0] = batch_idx;
+                output[output_idx * 3 + 1] = class_idx;
+                output[output_idx * 3 + 2] = sorted[i].box_index;
             }
             ++output_idx;
-            // Update removed[] using row i of the IoU mask. Each thread handles
-            // a stride of the row to balance work.
-            for(index_int j = i + 1 + idx.local; j < N; j += idx.nlocal())
+            for(index_int j = i + 1 + idx.local; j < NumBoxes; j += idx.nlocal())
             {
-                removed[j] |= mask[nms_packed_idx(i, j, N)];
+                removed[j] |= mask[nms_packed_idx(i, j, NumBoxes)];
             }
         }
         __syncthreads();
     }
 
     if(idx.local == 0)
-        block_counts[block_id] = static_cast<int32_t>(output_idx);
+        bc_counts[block_id] = static_cast<int32_t>(output_idx);
 }
 
-// Per-block filter driver: one block per (batch_idx, class_idx). Reads the
-// previously-sorted records out of `sorted_buf`, builds the IoU mask in
-// `mask_buf`, then runs the greedy filter writing selections into a per-block
-// region of `raw_output` and the per-block count into `counts_buf`.
-//
-// Expecting box-coordinate convention has already been normalized into corner form
-// in `sorted_buf`.
-//
-// The parameter order matches the flatten order of the precompile_op tuple
-// output (raw_output, counts). `sorted_buf` and `mask_buf` are scratch inputs
-// allocated upstream; `raw_output_buf` and `counts_buf` are the two halves of
-// the tuple-typed output buffer.
+// Per-block filter driver: one block per (batch_idx, class_idx).
+// Expects the box coordinates to have already been normalized into corner form.
 template <index_int NumBatches,
           index_int NumClasses,
           index_int NumBoxes,
@@ -269,91 +258,85 @@ template <index_int NumBatches,
           class IouThr,
           class ScoreThr,
           class Mask,
-          class RawOutput,
+          class Output,
           class Counts>
 __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
                                          MaxOut max_out_p,
                                          IouThr iou_thr_p,
                                          ScoreThr score_thr_p,
-                                         Mask mask_buf,
-                                         RawOutput raw_output_buf,
-                                         Counts counts_buf)
+                                         Mask mask,
+                                         Output output,
+                                         Counts bc_counts)
 {
-    static_assert(NumBatches > 0, "num_batches must be > 0");
-    static_assert(NumClasses > 0, "num_classes must be > 0");
+    static_assert(NumBatches > 0);
+    static_assert(NumClasses > 0);
+    static_assert(NumBoxes > 0);
 
     auto idx                            = make_index();
     const index_int block_id            = idx.group;
-    const int batch_idx                 = block_id / NumClasses;
-    const int class_idx                 = block_id % NumClasses;
-    constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1;
+    //constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1;
 
-    nms_data* my_sorted =
-        reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
-    uint8_t* my_mask = reinterpret_cast<uint8_t*>(mask_buf.data()) + block_id * iou_packed_size;
+    constexpr auto my_sorted_shape = make_shape(index_ints<NumBoxes>{});
+    nms_data* my_sorted_p = reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
+    auto my_sorted = make_tensor_view<nms_data>(my_sorted_p, my_sorted_shape);
+    
+    auto my_mask = slice_tensor(mask, block_id, slice_axes<1>());
+    auto my_output = slice_tensor(output, block_id, slice_axes<1, 2>());
 
-    // Pull scalar tensor inputs once. They're broadcast to all threads via the
-    // common load (each thread reads the same single element).
+    // Read scalar tensor inputs
     const int64_t max_output_boxes_per_class = max_out_p[0];
     const float iou_thr_val   = iou_thr_p[0];
     const float score_thr_val = score_thr_p[0];
 
-    if constexpr(NumBoxes > 1)
-    {
-        nms_make_iou_mask<NumBoxes>(idx, my_sorted, my_mask, iou_thr_val);
-        __syncthreads();
-    }
+    nms_make_iou_mask<NumBoxes>(idx, my_sorted, my_mask, iou_thr_val);
+    __syncthreads();
 
-    nms_filter_per_block<NumBoxes>(idx,
+    nms_filter_per_block<NumBoxes, NumClasses>(idx,
                                    my_sorted,
                                    my_mask,
-                                   batch_idx,
-                                   class_idx,
                                    max_output_boxes_per_class,
                                    score_thr_val,
-                                   reinterpret_cast<int64_t*>(raw_output_buf.data()),
-                                   reinterpret_cast<int32_t*>(counts_buf.data()));
+                                   my_output,
+                                   bc_counts);
 }
 
 
-// Phase 3
-// Move batch/class box index entries to the beginning of the output buffer.
-// Runs with 1 block. Reads from `raw_indices` (the filter kernel's per-block
-// output) and writes the compacted selections into `output_indices`.
+// Move batch/class box index entries to the beginning of the output buffer. Runs with 1 block.
 // `bc_counts`: Number of selected boxes per batch per class. (read-only)
-// `raw_indices`: Per-block raw indices written by the filter kernel
-// (read-only).
-// `output_indices`: Output box indices, packed contiguously at the beginning
+// `indices`: Box indices, kernel packs selected boxes in-place to the beginning
 // of the buffer in (batch, class) iteration order.
-// `output_num_selected`: Total number of selected boxes.
+// `num_selected`: Total number of selected boxes.
 template <index_int NumBatchClass,
           index_int NumBoxes,
           class Counts,
-          class RawIndices,
-          class IdxOutput,
-          class NumOutput>
+          class Idx,
+          class Num,
+          class Out>
 __device__ void nonmaxsuppression_compact(const Counts bc_counts,
-                                          RawIndices raw_indices,
-                                          IdxOutput output_indices,
-                                          NumOutput output_num_selected)
+                                          const Idx indices,
+                                          Num num_selected,
+                                          Out output)
 {
-    static_assert(NumBatchClass > 0, "NumBatchClass must be > 0");
-    static_assert(NumBatchClass <= 16000, "nms_compact: NumBlocks exceeds the LDS budget for offsets[]");
-    __shared__ array<index_int, NumBatchClass> offsets;
+    static_assert(NumBatchClass > 0);
+    static_assert(NumBoxes > 0);
+    static_assert(NumBatchClass <= 16000, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]");
+
+    auto idx = make_index();
+    __shared__ index_int offsets[NumBatchClass];
     // Exclusive prefix sum on bc_counts to get offsets
     block_scan(
         idx,
         op::sum{},
         0,
-        NumBlocks,
+        NumBatchClass,
         [&](auto i) -> int32_t { return bc_counts[i]; },
-        [&](auto i, auto inclusive_value) { offsets[i] = inclusive_value - block_counts[i]; });
+        [&](auto i, auto inclusive_value) { offsets[i] = inclusive_value - bc_counts[i]; });
     __syncthreads();
 
     // Get num_selected_boxes from last value of exclusive scan and add last bc_counts value.
     if(idx.local == 0)
     {
-        output_num_selected[0] = offsets[NumBatchClass-1] + block_counts[NumBlocks-1];
+        num_selected[0] = offsets[NumBatchClass-1] + bc_counts[NumBatchClass-1];
     }
 
     // swap index values to make the output packed
@@ -362,15 +345,12 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts,
     idx.local_stride(max_entries, [&](auto i) {
         const index_int batch_class_idx = i / NumBoxes;
         const index_int box_idx = i & NumBoxes;
-        if(box_idx < block_counts[batch_class_idx])
+        if(box_idx < bc_counts[batch_class_idx])
         {
-            auto src = [&](auto j){return output_indices[batch_class_idx * NumBoxes + box_idx * index_size + j]};
-            auto dst = [&](auto j){return output_indices[(offsets[batch_class_idx] + box_idx) * index_size + j]};
-            array<int64_t, 3> tmp_src = {src(0), src(1), src(2)};
             for(int k = 0; k < 3; ++k)
             {
-                src(k) = dst(k);
-                dst(k) = tmp_src[k];
+                output[(offsets[batch_class_idx] + box_idx) * index_size + k] =
+                indices[batch_class_idx * NumBoxes + box_idx * index_size + k] ;
             }
         }
     });
diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp
index 8f9219428a6..187d193b1bc 100644
--- a/src/targets/gpu/prepare_nonmaxsuppression.cpp
+++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp
@@ -26,13 +26,12 @@
 #include <migraphx/bit.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/iterator_for.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/matcher.hpp>
 #include <migraphx/module.hpp>
 #include <migraphx/register_op.hpp>
 
-#include <algorithm>
 #include <cstdint>
 
 namespace migraphx {
@@ -40,11 +39,11 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
 // nms_data is laid out as { float score; float box[4]; int box_index; } for a
-// total of 24 bytes per entry. The scratch workspace is allocated as raw int8
+// total of 24 bytes per entry. The scratch workspace is allocated as raw uint8
 // and reinterpreted in the kernel.
 static constexpr std::size_t nms_bytes_per_data = 24;
 
-// Phase-1 op: sort boxes per (batch, class) into a flat byte scratch buffer.
+// Sort boxes per (batch, class) into nms_data{} tensor.
 struct nms_sort
 {
     bool center_point_box = false;
@@ -64,20 +63,20 @@ struct nms_sort
         const auto& scores_s = inputs.at(1);
         if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
             MIGRAPHX_THROW("gpu::nms_sort: boxes and scores must be 3-D");
-        const auto nb = boxes_s.lens()[0];
-        const auto b  = boxes_s.lens()[1];
-        const auto nc = scores_s.lens()[1];
+        const auto num_batches = boxes_s.lens()[0];
+        const auto num_boxes  = boxes_s.lens()[1];
+        const auto num_classes = scores_s.lens()[1];
         const auto aligned_b =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(b)));
-        return shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}};
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint32_t>(num_boxes)));
+        return shape{shape::uint8_type, {num_batches * num_classes * aligned_b * nms_bytes_per_data}};
     }
 };
 MIGRAPHX_REGISTER_OP(nms_sort);
 
-// Phase-2 op: build the IoU mask and run the greedy filter. Produces a tuple
-// of (raw_output, bc_counts). num_batches/num_classes/num_boxes are kept as
-// op attributes because the filter inputs are flat scratch buffers from which
-// these can't be recovered.
+// Build the IoU mask and run the greedy filter.
+// Produces a tuple of (raw_output, bc_counts).
+// num_batches/num_classes/num_boxes are kept as op attributes because the filter input
+// is a scratch buffer from which these can't be recovered.
 struct nms_filter
 {
     std::size_t num_batches = 0;
@@ -97,23 +96,19 @@ struct nms_filter
     shape compute_shape(const std::vector<shape>& inputs) const
     {
         check_shapes{inputs, *this}.has(5);
-        shape raw_output_shape{shape::int64_type, {num_batches * num_classes * num_boxes * 3}};
+        shape output_shape{shape::int64_type, {num_batches * num_classes, num_boxes, 3}};
         shape bc_counts_shape{shape::int32_type, {num_batches * num_classes}};
-        return shape{{raw_output_shape, bc_counts_shape}};
+        return shape{{output_shape, bc_counts_shape}};
     }
 };
 MIGRAPHX_REGISTER_OP(nms_filter);
 
-// Phase-3 op: prefix-scan the per-block counts and compact the selections into
-// the final (selected_indices, num_selected) tuple.
+// TODO: This should work in-place, saving memory. Need to update IR to handle it.
+//  Needs a make_tuple type of operator that reuses the indices input.
+// Prefix-scan the per-block counts and compact the selections into
+// the final selected_indices. Output as selected_indices and num_selected tuple.
 struct nms_compact
 {
-    template <class Self, class F>
-    static auto reflect(Self&, F)
-    {
-        return pack();
-    }
-
     std::string name() const { return "gpu::nms_compact"; }
 
     shape compute_shape(const std::vector<shape>& inputs) const
@@ -130,85 +125,81 @@ MIGRAPHX_REGISTER_OP(nms_compact);
 
 namespace {
 
-std::vector<instruction_ref> find_nms(module& m)
+struct find_nonmaxsuppression
 {
-    std::vector<instruction_ref> result;
-    auto im = iterator_for(m);
-    std::copy_if(im.begin(), im.end(), std::back_inserter(result), [](auto ins) {
-        return ins->name() == "nonmaxsuppression";
-    });
-    return result;
-}
+    auto matcher() const { return match::name("nonmaxsuppression"); }
 
-void rewrite_nms(module& m, instruction_ref ins)
-{
-    auto inputs = ins->inputs();
-    if(inputs.size() < 2 or inputs.size() > 5)
-        MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " +
-                       std::to_string(inputs.size()));
-
-    const auto& boxes_s  = inputs[0]->get_shape();
-    const auto& scores_s = inputs[1]->get_shape();
-    if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
-        MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D");
-
-    const auto nb         = boxes_s.lens()[0];
-    const auto b          = boxes_s.lens()[1];
-    const auto nc         = scores_s.lens()[1];
-    const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1};
-
-    // Fill in missing optional scalar inputs with default literals. The kernels
-    // load these via tensor_view<T, {1}>, so single-element shapes are needed.
-    const shape default_max_s{shape::int64_type, {1}};
-    const shape default_iou_s{shape::float_type, {1}};
-    const shape default_thr_s{shape::float_type, {1}};
-    if(inputs.size() < 3)
-        inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
-    if(inputs.size() < 4)
-        inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}}));
-    if(inputs.size() < 5)
-        inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}}));
-
-    auto op_val = ins->get_operator().to_value();
-    bool center_point_box = op_val.at("center_point_box").to<bool>();
-
-    // Mask is scratch only; allocate up-front so the standard replace_allocate
-    // pass can later turn it into hip::allocate.
-    shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}};
-    auto mask_alloc =
-        m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}}));
-
-    auto sorted = m.insert_instruction(
-        ins, make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), inputs[0], inputs[1]);
-
-    auto filter = m.insert_instruction(
-        ins,
-        make_op("gpu::nms_filter",
-                {{"num_batches", nb}, {"num_classes", nc}, {"num_boxes", b}}),
-        sorted,
-        inputs[2],
-        inputs[3],
-        inputs[4],
-        mask_alloc);
-
-    auto raw_output =
-        m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
-    auto bc_counts =
-        m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
-
-    auto compact = m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
-
-    m.replace_instruction(ins, compact);
-}
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins    = r.result;
+        auto inputs = ins->inputs();
+        if(inputs.size() < 2 or inputs.size() > 5)
+            MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " +
+                           std::to_string(inputs.size()));
+
+        const auto& boxes_s  = inputs[0]->get_shape();
+        const auto& scores_s = inputs[1]->get_shape();
+        if(boxes_s.ndim() != 3 or scores_s.ndim() != 3)
+            MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D");
+
+        const auto num_batches = boxes_s.lens()[0];
+        const auto num_boxes = boxes_s.lens()[1];
+        const auto num_classes = scores_s.lens()[1];
+        const auto iou_packed = (num_boxes * (num_boxes - 1) / 2);
+
+        // Fill in missing optional scalar inputs with default literals.
+        const shape default_max_s{shape::int64_type, {1}};
+        const shape default_iou_s{shape::float_type, {1}};
+        const shape default_thr_s{shape::float_type, {1}};
+        if(inputs.size() < 3)
+            inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
+        if(inputs.size() < 4)
+            inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}}));
+        if(inputs.size() < 5)
+            inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}}));
+
+        auto op_val           = ins->get_operator().to_value();
+        bool center_point_box = op_val.at("center_point_box").to<bool>();
+
+        // Mask is scratch only; allocate up-front so the standard
+        // replace_allocate pass can later turn it into hip::allocate.
+        shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
+        auto mask_alloc =
+            m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}}));
+
+        auto sorted = m.insert_instruction(
+            ins,
+            make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}),
+            inputs[0],
+            inputs[1]);
+
+        auto filter = m.insert_instruction(
+            ins,
+            make_op("gpu::nms_filter",
+                    {{"num_batches", num_batches}, {"num_classes", num_classes}, {"num_boxes", num_boxes}}),
+            sorted,
+            inputs[2],
+            inputs[3],
+            inputs[4],
+            mask_alloc);
+
+        auto output =
+            m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
+        auto bc_counts =
+            m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
+
+        auto compact =
+            m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, output);
+
+        m.replace_instruction(ins, compact);
+    }
+};
 
 } // namespace
 
 void prepare_nonmaxsuppression::apply(module& m) const
 {
-    for(auto ins : find_nms(m))
-    {
-        rewrite_nms(m, ins);
-    }
+    match::find_matches(m, find_nonmaxsuppression{});
 }
 
 } // namespace gpu

From 5ca611f7cc8d8eced2ce84152ec8e6fb3bc470ac Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Mon, 18 May 2026 12:31:16 -0500
Subject: [PATCH 10/32] Fix JIT global and local. Single verify_test test_nms
 works.

---
 .../gpu/include/migraphx/gpu/compile_hip_code_object.hpp   | 4 +++-
 src/targets/gpu/jit/nonmaxsuppression.cpp                  | 6 +++---
 test/verify/test_nms.cpp                                   | 7 ++++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
index f434348dbd5..592e32b9af4 100644
--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -36,8 +36,10 @@ namespace gpu {
 struct context;
 
 struct hip_compile_options
-{
+{   
+    // Total number of threads
     std::size_t global;
+    // Threads per block
     std::size_t local;
     std::vector<shape> inputs;
     shape output;
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index be37fcadbc6..14df58b8d41 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -131,14 +131,14 @@ struct nms_sort_compiler : compiler<nms_sort_compiler>
         const auto aligned_num_boxes =
             static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
         // NOTE: topK kernel uses relement/4 for amount of work in a block?
-        auto block_size = compute_block_size(ctx, num_boxes, 1024);
+        auto block_size = compute_block_size(ctx, aligned_num_boxes, 1024);
 
         hip_compile_options options;
         options.inputs         = inputs;
         options.output         = inputs.back();
         options.kernel_name    = "nms_sort_kernel";
         options.virtual_inputs = inputs;
-        options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
+        options.set_launch_params(v, num_batches * num_classes * block_size, block_size);
 
         auto src = interpolate_string(
             nms_sort_kernel_src,
@@ -181,7 +181,7 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
         options.output         = inputs.back();
         options.kernel_name    = "nms_filter_kernel";
         options.virtual_inputs = options.inputs;
-        options.set_launch_params(v, block_size * num_batches * num_classes, block_size);
+        options.set_launch_params(v, num_batches * num_classes * block_size, block_size);
 
         auto src = interpolate_string(
             nms_filter_kernel_src,
diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp
index 9039784a689..99a502dbbfc 100644
--- a/test/verify/test_nms.cpp
+++ b/test/verify/test_nms.cpp
@@ -51,10 +51,11 @@ struct test_nms : verify_program<test_nms>
                                 iou_threshold,
                                 score_threshold);
 
-        auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
+        //auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
         auto num_selected = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms);
-        auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
-        mm->add_return({slice_ins});
+        //auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
+        //mm->add_return({slice_ins});
+        mm->add_return({num_selected});
 
         return p;
     }

From a48c90999ab2114eae3fcea51fe6427fafcd9e05 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Mon, 18 May 2026 17:04:07 -0500
Subject: [PATCH 11/32] Fix NMS compact indexing and argument order

---
 src/include/migraphx/op/nonmaxsuppression.hpp         |  9 ++++++---
 src/targets/gpu/jit/nonmaxsuppression.cpp             | 11 ++++++-----
 .../include/migraphx/kernels/nonmaxsuppression.hpp    | 11 +++++------
 src/targets/gpu/prepare_nonmaxsuppression.cpp         |  2 +-
 4 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 9cf3d41070b..68ea521f4f6 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -297,12 +297,16 @@ struct nonmaxsuppression
         auto output_shapes = flatten({output_shape});
         shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()};
         argument result{max_output_shape};
+        argument num_selected_result{output_shapes.at(1)};
 
         std::size_t max_output_boxes_per_class =
             (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
         if(max_output_boxes_per_class == 0)
         {
-            return result;
+            num_selected_result.visit([&](auto output){
+                output.at(0) = 0;
+            });
+            return {{result, num_selected_result}};
         }
         double iou_threshold     = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
         double score_threshold   = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f;
@@ -318,9 +322,8 @@ struct nonmaxsuppression
                                            score_threshold);
             });
         });
-        argument num_selected_result{output_shapes.at(1)};
         num_selected_result.visit([&](auto output){
-            output.begin() = num_selected;
+            output.at(0) = num_selected;
         });
         return {{result, num_selected_result}};
     }
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index 14df58b8d41..2581f975d8b 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -104,10 +104,10 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
 {
     make_tensors()(${args})([](const auto bc_counts,
                                auto indices,
-                               auto num_selected,
-                               auto output) {
+                               auto selected_indices,
+                               auto num_selected) {
         nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(
-            bc_counts, indices, num_selected, output);
+            bc_counts, indices, selected_indices, num_selected);
     });
 }
 
@@ -213,8 +213,9 @@ struct nms_compact_compiler : compiler<nms_compact_compiler>
         const auto& indices_s = inputs[1];
         const auto num_batch_class = cnt_s.elements();
         const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3});
-        // TODO: tune for max block size?
-        const auto block_size = compute_block_size(ctx, num_boxes, 256);
+        // TODO: tune for block size?
+        // num_boxes block size could also work?
+        const auto block_size = compute_block_size(ctx, num_batch_class * num_boxes, 256);
 
         hip_compile_options options;
         options.inputs         = flatten(inputs);
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index c226ab78ab6..ca4236c65fa 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -150,8 +150,8 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
         }
         else
         {
-            // Sentinel: -inf score so it never beats any real entry
-            tmp_data.score     = -__FLT_MAX__;
+            // Sentinel: score so it never beats any real entry
+            tmp_data.score     = numeric_limits<Boxes::value_type>::lowest();
             tmp_data.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
             tmp_data.box_index = -1;
         }
@@ -204,8 +204,7 @@ __device__ void nms_filter_per_block(index idx,
                                      Output output,
                                      Counts bc_counts)
 {
-    static_assert(NumBoxes > 1);
-
+    static_assert(NumBoxes > 0);
     const index_int block_id = idx.group;
     const int batch_idx = block_id / NumClasses;
     const int class_idx = block_id % NumClasses;
@@ -344,13 +343,13 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts,
     constexpr index_int max_entries = NumBatchClass * NumBoxes;
     idx.local_stride(max_entries, [&](auto i) {
         const index_int batch_class_idx = i / NumBoxes;
-        const index_int box_idx = i & NumBoxes;
+        const index_int box_idx = i % NumBoxes;
         if(box_idx < bc_counts[batch_class_idx])
         {
             for(int k = 0; k < 3; ++k)
             {
                 output[(offsets[batch_class_idx] + box_idx) * index_size + k] =
-                indices[batch_class_idx * NumBoxes + box_idx * index_size + k] ;
+                indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k] ;
             }
         }
     });
diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp
index 187d193b1bc..25f65c65b00 100644
--- a/src/targets/gpu/prepare_nonmaxsuppression.cpp
+++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp
@@ -189,7 +189,7 @@ struct find_nonmaxsuppression
             m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
 
         auto compact =
-            m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, output);
+            m.insert_instruction(ins, make_op("gpu::nms_compact"), output, bc_counts);
 
         m.replace_instruction(ins, compact);
     }

From e1e936b27c4cff366b446f39b832de5c08719472 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Mon, 18 May 2026 17:43:35 -0500
Subject: [PATCH 12/32] Add ref-like tests for GPU NMS, rename shape's flatten
 to flatten_shapes because its name clashes with operator::flatten.

---
 src/include/migraphx/op/nonmaxsuppression.hpp |   7 +-
 src/include/migraphx/shape.hpp                |   2 +-
 src/shape.cpp                                 |   4 +-
 src/targets/gpu/code_object_op.cpp            |   2 +-
 src/targets/gpu/jit/nonmaxsuppression.cpp     |   4 +-
 src/targets/gpu/jit/pointwise.cpp             |   2 +-
 src/targets/gpu/jit/reduce.cpp                |   2 +-
 src/targets/gpu/jit/topk.cpp                  |   2 +-
 .../migraphx/kernels/nonmaxsuppression.hpp    |  12 +-
 test/gpu/nonmaxsuppression.cpp                | 311 ++++++++++++++++++
 10 files changed, 332 insertions(+), 16 deletions(-)
 create mode 100644 test/gpu/nonmaxsuppression.cpp

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 68ea521f4f6..b6cbd4c9bc1 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -36,6 +36,7 @@
 #include <migraphx/tensor_view.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/shape.hpp>
 #include <migraphx/output_iterator.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/par.hpp>
@@ -294,7 +295,7 @@ struct nonmaxsuppression
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         // make buffer of maximum size
-        auto output_shapes = flatten({output_shape});
+        auto output_shapes = flatten_shapes({output_shape});
         shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()};
         argument result{max_output_shape};
         argument num_selected_result{output_shapes.at(1)};
@@ -304,7 +305,7 @@ struct nonmaxsuppression
         if(max_output_boxes_per_class == 0)
         {
             num_selected_result.visit([&](auto output){
-                output.at(0) = 0;
+                output[0] = 0;
             });
             return {{result, num_selected_result}};
         }
@@ -323,7 +324,7 @@ struct nonmaxsuppression
             });
         });
         num_selected_result.visit([&](auto output){
-            output.at(0) = num_selected;
+            output[0] = num_selected;
         });
         return {{result, num_selected_result}};
     }
diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp
index 1a8c1f9d53e..08bff8ee04c 100644
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -615,7 +615,7 @@ struct MIGRAPHX_EXPORT shape
 };
 
 /// Flatten subshapes to a single vector of non-tuple type of shapes
-MIGRAPHX_EXPORT std::vector<shape> flatten(const std::vector<shape>& shapes);
+MIGRAPHX_EXPORT std::vector<shape> flatten_shapes(const std::vector<shape>& shapes);
 
 MIGRAPHX_EXPORT void migraphx_to_value(value& v, const shape& s);
 MIGRAPHX_EXPORT void migraphx_from_value(const value& v, shape& s);
diff --git a/src/shape.cpp b/src/shape.cpp
index 7732ead6b11..afd716f72d2 100644
--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -1378,14 +1378,14 @@ const std::vector<shape>& shape::sub_shapes() const { return impl->m_shapes; }
 
 void shape::debug_print() const { std::cout << *this << std::endl; }
 
-std::vector<shape> flatten(const std::vector<shape>& shapes)
+std::vector<shape> flatten_shapes(const std::vector<shape>& shapes)
 {
     std::vector<shape> result;
     for(const auto& s : shapes)
     {
         if(s.type() == shape::tuple_type)
         {
-            auto subs = flatten(s.sub_shapes());
+            auto subs = flatten_shapes(s.sub_shapes());
             result.insert(result.end(), subs.begin(), subs.end());
         }
         else
diff --git a/src/targets/gpu/code_object_op.cpp b/src/targets/gpu/code_object_op.cpp
index 6a567329c6b..e53724c23ce 100644
--- a/src/targets/gpu/code_object_op.cpp
+++ b/src/targets/gpu/code_object_op.cpp
@@ -41,7 +41,7 @@ shape code_object_op::compute_shape(std::vector<shape> inputs) const
     std::transform(einputs.begin(), einputs.end(), einputs.begin(), [](const shape& s) {
         return s.normalize_standard();
     });
-    if(not migraphx::equal(flatten(einputs), flatten(inputs), &shape::is_compatible))
+    if(not migraphx::equal(flatten_shapes(einputs), flatten_shapes(inputs), &shape::is_compatible))
         MIGRAPHX_THROW("Input shapes have changed: [" + to_string_range(einputs) + "] -> [" +
                        to_string_range(inputs) + "]");
     auto output_buffer_shape = inputs.at(get_output_arg(inputs.size()));
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index 2581f975d8b..dfa5aaffcba 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -177,7 +177,7 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
         const auto block_size = compute_block_size(ctx, num_boxes/2, 256);
 
         hip_compile_options options;
-        options.inputs         = flatten(inputs);
+        options.inputs         = flatten_shapes(inputs);
         options.output         = inputs.back();
         options.kernel_name    = "nms_filter_kernel";
         options.virtual_inputs = options.inputs;
@@ -218,7 +218,7 @@ struct nms_compact_compiler : compiler<nms_compact_compiler>
         const auto block_size = compute_block_size(ctx, num_batch_class * num_boxes, 256);
 
         hip_compile_options options;
-        options.inputs         = flatten(inputs);
+        options.inputs         = flatten_shapes(inputs);
         options.output         = inputs.back();
         options.kernel_name    = "nms_compact_kernel";
         options.virtual_inputs = options.inputs;
diff --git a/src/targets/gpu/jit/pointwise.cpp b/src/targets/gpu/jit/pointwise.cpp
index c3ce45c12cf..80597139dac 100644
--- a/src/targets/gpu/jit/pointwise.cpp
+++ b/src/targets/gpu/jit/pointwise.cpp
@@ -74,7 +74,7 @@ struct pointwise_compiler : compiler<pointwise_compiler>
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         hip_compile_options options;
-        options.inputs         = flatten(inputs);
+        options.inputs         = flatten_shapes(inputs);
         options.output         = inputs.back();
         options.virtual_inputs = reduce_dims(normalize_permutation(options.inputs));
         options.emplace_param("-Wno-float-equal");
diff --git a/src/targets/gpu/jit/reduce.cpp b/src/targets/gpu/jit/reduce.cpp
index a9d506c41e7..a5b9613c5b2 100644
--- a/src/targets/gpu/jit/reduce.cpp
+++ b/src/targets/gpu/jit/reduce.cpp
@@ -322,7 +322,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
     {
         auto assign         = v.get("assign", "assign_none");
         auto axes           = v.at("axes").to_vector<std::size_t>();
-        auto finputs        = flatten(inputs);
+        auto finputs        = flatten_shapes(inputs);
         auto noutputs       = finputs.size() - inputs.size() + 1;
         auto virtual_inputs = finputs;
         virtual_inputs.push_back(get_reduced_shape(get_input_shape(finputs), axes));
diff --git a/src/targets/gpu/jit/topk.cpp b/src/targets/gpu/jit/topk.cpp
index 745d5b2c7da..1deafb2db60 100644
--- a/src/targets/gpu/jit/topk.cpp
+++ b/src/targets/gpu/jit/topk.cpp
@@ -65,7 +65,7 @@ struct topk_compiler : compiler<topk_compiler>
     {
         hip_compile_options options;
         options.output      = inputs.back();
-        options.inputs      = flatten(inputs);
+        options.inputs      = flatten_shapes(inputs);
         options.kernel_name = "topk_kernel";
 
         auto axis           = v.at("axis").to<int64_t>();
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index ca4236c65fa..c1d4398acc9 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -137,15 +137,19 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
     auto* p = reinterpret_cast<nms_data*>(out_tv.data()) + block_id * AlignedNumBoxes;
     auto block_out_tv = make_tensor_view<nms_data>(p, block_out_shape);
 
-    const auto* boxes_b   = boxes_tv.data() + batch_idx * NumBoxes * 4;
-    const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
+    //const auto* boxes_b   = boxes_tv.data() + batch_idx * NumBoxes * 4;
+    //const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
+    // Get tensor_view slice of boxes. numpy slicing: boxes[batch_idx, :, :]
+    const auto my_boxes = slice_tensor(boxes_tv, batch_idx, slice_axes<1, 2>());
+    // Get tensor_view slice of scores. numpy slicing: scores[batch_idx, class_idx, :]
+    const auto my_scores = slice_tensor(scores_tv, block_id, slice_axes<2>());
 
     nms_data tmp_data;
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            tmp_data.score     = scores_bc[i];
-            tmp_data.box       = nms_normalize_box<CenterPointBox>(boxes_b + i * 4);
+            tmp_data.score     = my_scores[i];
+            tmp_data.box       = nms_normalize_box<CenterPointBox>(my_boxes + i * 4);
             tmp_data.box_index = static_cast<int>(i);
         }
         else
diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp
new file mode 100644
index 00000000000..ad4c1f27fe4
--- /dev/null
+++ b/test/gpu/nonmaxsuppression.cpp
@@ -0,0 +1,311 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/instruction.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/register_target.hpp>
+#include <migraphx/verify.hpp>
+
+#include <test.hpp>
+
+static std::pair<std::vector<int64_t>, int64_t>
+run_gpu_nms(migraphx::program p, const migraphx::parameter_map& host_params = {})
+{
+    migraphx::target t = migraphx::make_target("gpu");
+    p.compile(t);
+
+    migraphx::parameter_map gpu_params;
+    for(auto&& x : p.get_parameter_shapes())
+    {
+        auto it = host_params.find(x.first);
+        if(it != host_params.end())
+            gpu_params[x.first] = t.copy_to(it->second);
+        else
+            gpu_params[x.first] = t.allocate(x.second);
+    }
+
+    auto results  = p.eval(gpu_params);
+    auto idx_host = t.copy_from(results.at(0));
+    auto cnt_host = t.copy_from(results.at(1));
+
+    std::vector<int64_t> indices;
+    idx_host.visit([&](auto v) { indices.assign(v.begin(), v.end()); });
+
+    int64_t num_selected = 0;
+    cnt_host.visit([&](auto v) { num_selected = static_cast<int64_t>(v[0]); });
+
+    return {indices, num_selected};
+}
+
+static void add_nms_return(migraphx::module* mm, migraphx::instruction_ref nms)
+{
+    auto idx = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
+    auto cnt = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms);
+    mm->add_return({idx, cnt});
+}
+
+TEST_CASE(nms_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_l,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 3);
+}
+
+TEST_CASE(nms_identical_all_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+    std::vector<float> boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5,
+                                    0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5};
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    std::vector<float> scores_vec = {0.9, 0.9, 0.9, 0.9, 0.9, 0.9};
+
+    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{6});
+    auto iou_threshold   = mm->add_literal(0.1f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_l,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 0};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 1);
+}
+
+TEST_CASE(nms_not_center_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+    std::vector<float> boxes_vec = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
+                                    0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
+                                    1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_l,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 3);
+}
+
+TEST_CASE(nms_transpose1_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 4, 6}};
+    std::vector<float> boxes_vec = {
+        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
+    };
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto t_boxes_l       = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto transpose_boxes = mm->add_instruction(
+        migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            transpose_boxes,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 3);
+}
+
+TEST_CASE(nms_transpose2_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {4, 1, 6}};
+    std::vector<float> boxes_vec = {
+        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
+    };
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto t_boxes_l       = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto transpose_boxes = mm->add_instruction(
+        migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            transpose_boxes,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 3);
+}
+
+TEST_CASE(nms_multi_batch_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}};
+    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0,
+                                    0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}};
+    std::vector<float> scores_vec = {
+        0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_l,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 1, 0, 3, 1, 0, 0, 1, 0, 5};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 6);
+}
+
+TEST_CASE(nms_multi_class_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+    std::vector<float> boxes_vec = {0.0, 0.0,  1.0, 1.0,  0.0, 0.1,   1.0, 1.1,
+                                    0.0, -0.1, 1.0, 0.9,  0.0, 10.0,  1.0, 11.0,
+                                    0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
+
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}};
+    std::vector<float> scores_vec = {
+        0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
+    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto max_out_l       = mm->add_literal(int64_t{2});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_l,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 1, 3, 0, 1, 0};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 4);
+}
+
+int main(int argc, const char* argv[]) { test::run(argc, argv); }

From fc728f3642e9bbcbb8dbd96894b12ebffeb6f27d Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Mon, 18 May 2026 17:44:34 -0500
Subject: [PATCH 13/32] Remove verify NMS tests. They don't make sense for
 random data.

---
 test/verify/test_nms.cpp | 201 ---------------------------------------
 1 file changed, 201 deletions(-)
 delete mode 100644 test/verify/test_nms.cpp

diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp
deleted file mode 100644
index 99a502dbbfc..00000000000
--- a/test/verify/test_nms.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include "verify_program.hpp"
-#include <migraphx/program.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/make_op.hpp>
-
-struct test_nms : verify_program<test_nms>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm = p.get_main_module();
-
-        migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-        migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-
-        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
-        auto scores_l        = mm->add_parameter("scores", scores_s);
-        auto max_out_l       = mm->add_literal(int64_t{4});
-        auto iou_threshold   = mm->add_literal(0.5f);
-        auto score_threshold = mm->add_literal(0.0f);
-
-        auto nms =
-            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
-                                boxes_l,
-                                scores_l,
-                                max_out_l,
-                                iou_threshold,
-                                score_threshold);
-
-        //auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
-        auto num_selected = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms);
-        //auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
-        //mm->add_return({slice_ins});
-        mm->add_return({num_selected});
-
-        return p;
-    }
-};
-
-// Multi-batch fixed-output NMS exercises the (batch_idx, class_idx) -> block_id
-// dispatch in the GPU kernel.
-struct test_nms_multi_batch : verify_program<test_nms_multi_batch>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm = p.get_main_module();
-
-        migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}};
-        migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}};
-
-        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
-        auto scores_l        = mm->add_parameter("scores", scores_s);
-        auto max_out_l       = mm->add_literal(int64_t{4});
-        auto iou_threshold   = mm->add_literal(0.5f);
-        auto score_threshold = mm->add_literal(0.0f);
-
-        auto r =
-            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
-                                boxes_l,
-                                scores_l,
-                                max_out_l,
-                                iou_threshold,
-                                score_threshold);
-        mm->add_return({r});
-
-        return p;
-    }
-};
-
-// Multi-class fixed-output NMS exercises per-class greedy filtering with
-// outputs interleaved by the global atomic counter.
-struct test_nms_multi_class : verify_program<test_nms_multi_class>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm = p.get_main_module();
-
-        migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-        migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}};
-
-        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
-        auto scores_l        = mm->add_parameter("scores", scores_s);
-        auto max_out_l       = mm->add_literal(int64_t{2});
-        auto iou_threshold   = mm->add_literal(0.5f);
-        auto score_threshold = mm->add_literal(0.0f);
-
-        auto r =
-            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}),
-                                boxes_l,
-                                scores_l,
-                                max_out_l,
-                                iou_threshold,
-                                score_threshold);
-        mm->add_return({r});
-
-        return p;
-    }
-};
-
-// center_point_box=0 path with potentially flipped corner coordinates.
-struct test_nms_not_center : verify_program<test_nms_not_center>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm = p.get_main_module();
-
-        migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-        migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-
-        auto boxes_l         = mm->add_parameter("boxes", boxes_s);
-        auto scores_l        = mm->add_parameter("scores", scores_s);
-        auto max_out_l       = mm->add_literal(int64_t{4});
-        auto iou_threshold   = mm->add_literal(0.5f);
-        auto score_threshold = mm->add_literal(0.0f);
-
-        auto r =
-            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 0}}),
-                                boxes_l,
-                                scores_l,
-                                max_out_l,
-                                iou_threshold,
-                                score_threshold);
-        mm->add_return({r});
-
-        return p;
-    }
-};
-
-// TODO: update this test
-// Test NMS with dynamic inputs that have different compile-time spatial ranges.
-// This reproduces the scenario from nms_repro_minidimmismatch.py where
-// boxes has 10 spatial entries and scores has 5, but at runtime both are
-// sliced/provided with spatial_dimension=5. The compile-time ranges differ:
-//   boxes spatial: {4, 10}, scores spatial: {4, 5}
-// but runtime spatial dimensions match so NMS should succeed.
-struct test_nms_dyn_slice : verify_program<test_nms_dyn_slice>
-{
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm = p.get_main_module();
-
-        // boxes: [1, {4..10}, 4] — up to 10 spatial entries
-        migraphx::shape boxes_s{migraphx::shape::float_type, {{1, 1}, {4, 10}, {4, 4}}};
-        // scores: [1, 1, {4..5}] — up to 5 spatial entries (different range!)
-        migraphx::shape scores_s{migraphx::shape::float_type, {{1, 1}, {1, 1}, {4, 5}}};
-
-        auto boxes_l  = mm->add_parameter("boxes", boxes_s);
-        auto scores_l = mm->add_parameter("scores", scores_s);
-
-        auto max_out_l       = mm->add_literal(int64_t{4});
-        auto iou_threshold   = mm->add_literal(0.5f);
-        auto score_threshold = mm->add_literal(0.0f);
-
-        auto r =
-            mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}),
-                                boxes_l,
-                                scores_l,
-                                max_out_l,
-                                iou_threshold,
-                                score_threshold);
-        mm->add_return({r});
-
-        return p;
-    }
-
-    // At runtime, both have spatial_dimension=5 (matching)
-    std::unordered_map<std::string, migraphx::shape> get_test_dims() const
-    {
-        return {{"boxes", migraphx::shape{migraphx::shape::float_type, {1, 5, 4}}},
-                {"scores", migraphx::shape{migraphx::shape::float_type, {1, 1, 5}}}};
-    }
-};
-

From c2ddb73bbdbb89e0a198bc491bbfbb3d8cdb05dd Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Tue, 19 May 2026 16:09:35 -0500
Subject: [PATCH 14/32] Fix kernels and tests

---
 .../migraphx/kernels/nonmaxsuppression.hpp    |  24 +-
 src/targets/gpu/prepare_nonmaxsuppression.cpp |   5 +-
 test/gpu/nonmaxsuppression.cpp                | 737 ++++++++++++++++--
 3 files changed, 667 insertions(+), 99 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index c1d4398acc9..bde081bbc69 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -34,6 +34,7 @@
 #include <migraphx/kernels/tensor_view.hpp>
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/slice.hpp>
+#include <migraphx/kernels/type_traits.hpp>
 
 namespace migraphx {
 
@@ -130,32 +131,29 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
 
     auto idx = make_index();
     const index_int block_id = idx.group;
-    const int batch_idx      = static_cast<int>(block_id / NumClasses);
-    const int class_idx      = static_cast<int>(block_id % NumClasses);
+    const int batch_idx = block_id / NumClasses;
+    const int class_idx = block_id % NumClasses;
     
     constexpr auto block_out_shape = make_shape(index_ints<AlignedNumBoxes>{});
     auto* p = reinterpret_cast<nms_data*>(out_tv.data()) + block_id * AlignedNumBoxes;
     auto block_out_tv = make_tensor_view<nms_data>(p, block_out_shape);
 
-    //const auto* boxes_b   = boxes_tv.data() + batch_idx * NumBoxes * 4;
-    //const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes;
-    // Get tensor_view slice of boxes. numpy slicing: boxes[batch_idx, :, :]
-    const auto my_boxes = slice_tensor(boxes_tv, batch_idx, slice_axes<1, 2>());
-    // Get tensor_view slice of scores. numpy slicing: scores[batch_idx, class_idx, :]
-    const auto my_scores = slice_tensor(scores_tv, block_id, slice_axes<2>());
+    // numpy indexing: scores[batch_idx, class_idx, :]
+    const auto my_scores = slice_tensor(scores_tv, array<index_int, 3>{batch_idx, class_idx, 0}, slice_axes<2>());
 
     nms_data tmp_data;
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
             tmp_data.score     = my_scores[i];
-            tmp_data.box       = nms_normalize_box<CenterPointBox>(my_boxes + i * 4);
+            // numpy indexing: boxes[batch_idx, i, :]
+            tmp_data.box       = nms_normalize_box<CenterPointBox>(slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>()));
             tmp_data.box_index = static_cast<int>(i);
         }
         else
         {
-            // Sentinel: score so it never beats any real entry
-            tmp_data.score     = numeric_limits<Boxes::value_type>::lowest();
+            // Sentinel: lowest value of the score type, so padding never beats a real entry
+            tmp_data.score     = numeric_lowest<typename Scores::type>();
             tmp_data.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
             tmp_data.box_index = -1;
         }
@@ -317,8 +315,8 @@ template <index_int NumBatchClass,
           class Out>
 __device__ void nonmaxsuppression_compact(const Counts bc_counts,
                                           const Idx indices,
-                                          Num num_selected,
-                                          Out output)
+                                          Out output,
+                                          Num num_selected)
 {
     static_assert(NumBatchClass > 0);
     static_assert(NumBoxes > 0);
diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp
index 25f65c65b00..53514963c13 100644
--- a/src/targets/gpu/prepare_nonmaxsuppression.cpp
+++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp
@@ -148,6 +148,7 @@ struct find_nonmaxsuppression
         const auto iou_packed = (num_boxes * (num_boxes - 1) / 2);
 
         // Fill in missing optional scalar inputs with default literals.
+        // TODO: wrong way to handle optional inputs; should check whether each input is eval'able.
         const shape default_max_s{shape::int64_type, {1}};
         const shape default_iou_s{shape::float_type, {1}};
         const shape default_thr_s{shape::float_type, {1}};
@@ -183,13 +184,13 @@ struct find_nonmaxsuppression
             inputs[4],
             mask_alloc);
 
-        auto output =
+        auto raw_output =
             m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
         auto bc_counts =
             m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
 
         auto compact =
-            m.insert_instruction(ins, make_op("gpu::nms_compact"), output, bc_counts);
+            m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
 
         m.replace_instruction(ins, compact);
     }
diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp
index ad4c1f27fe4..3f7aab9b432 100644
--- a/test/gpu/nonmaxsuppression.cpp
+++ b/test/gpu/nonmaxsuppression.cpp
@@ -71,28 +71,40 @@ TEST_CASE(nms_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
-
     migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+    migraphx::shape scalar_s{migraphx::shape::float_type, {1}};
+    migraphx::shape int_scalar_s{migraphx::shape::int64_type, {1}};
 
-    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
-    auto max_out_l       = mm->add_literal(int64_t{4});
-    auto iou_threshold   = mm->add_literal(0.5f);
-    auto score_threshold = mm->add_literal(0.0f);
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_p       = mm->add_parameter("max_out", int_scalar_s);
+    auto iou_threshold   = mm->add_parameter("iou_threshold", scalar_s);
+    auto score_threshold = mm->add_parameter("score_threshold", scalar_s);
 
     auto nms =
         mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                            boxes_l,
-                            scores_l,
-                            max_out_l,
+                            boxes_p,
+                            scores_p,
+                            max_out_p,
                             iou_threshold,
                             score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+    std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+    int64_t max_out_val = 4;
+    float iou_val       = 0.5f;
+    float score_val     = 0.0f;
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]           = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"]          = migraphx::argument(scores_s, scores_vec.data());
+    host_params["max_out"]         = migraphx::argument(int_scalar_s, &max_out_val);
+    host_params["iou_threshold"]   = migraphx::argument(scalar_s, &iou_val);
+    host_params["score_threshold"] = migraphx::argument(scalar_s, &score_val);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
@@ -104,26 +116,34 @@ TEST_CASE(nms_identical_all_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-    std::vector<float> boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5,
-                                    0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5};
     migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-    std::vector<float> scores_vec = {0.9, 0.9, 0.9, 0.9, 0.9, 0.9};
+    migraphx::shape scalar_s{migraphx::shape::float_type, {1}};
 
-    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
     auto max_out_l       = mm->add_literal(int64_t{6});
-    auto iou_threshold   = mm->add_literal(0.1f);
+    auto iou_threshold   = mm->add_parameter("iou_threshold", scalar_s);
     auto score_threshold = mm->add_literal(0.0f);
 
     auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
-                                   boxes_l,
-                                   scores_l,
+                                   boxes_p,
+                                   scores_p,
                                    max_out_l,
                                    iou_threshold,
                                    score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5,
+                                    0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5};
+    std::vector<float> scores_vec = {0.9f, 0.9f, 0.9f, 0.9f, 0.9f, 0.9f};
+    float iou_val = 0.1f;
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]         = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"]        = migraphx::argument(scores_s, scores_vec.data());
+    host_params["iou_threshold"] = migraphx::argument(scalar_s, &iou_val);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 0};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
@@ -135,28 +155,32 @@ TEST_CASE(nms_not_center_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-    std::vector<float> boxes_vec = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
-                                    0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
-                                    1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
-
     migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
 
-    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
     auto max_out_l       = mm->add_literal(int64_t{4});
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
     auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
-                                   boxes_l,
-                                   scores_l,
+                                   boxes_p,
+                                   scores_p,
                                    max_out_l,
                                    iou_threshold,
                                    score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
+                                    0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
+                                    1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
+    std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
@@ -168,32 +192,39 @@ TEST_CASE(nms_transpose1_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {1, 4, 6}};
-    std::vector<float> boxes_vec = {
-        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
-        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
-    };
-
     migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+    migraphx::shape int_scalar_s{migraphx::shape::int64_type, {1}};
 
-    auto t_boxes_l       = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
-    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto t_boxes_p       = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_p       = mm->add_parameter("max_out", int_scalar_s);
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
     auto transpose_boxes = mm->add_instruction(
-        migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l);
+        migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_p);
     auto nms =
         mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                             transpose_boxes,
-                            scores_l,
-                            max_out_l,
+                            scores_p,
+                            max_out_p,
                             iou_threshold,
                             score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {
+        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
+    };
+    std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+    int64_t max_out_val = 4;
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]   = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"]  = migraphx::argument(scores_s, scores_vec.data());
+    host_params["max_out"] = migraphx::argument(int_scalar_s, &max_out_val);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
@@ -205,32 +236,36 @@ TEST_CASE(nms_transpose2_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {4, 1, 6}};
-    std::vector<float> boxes_vec = {
-        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
-        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
-    };
-
     migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
-    std::vector<float> scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
 
-    auto t_boxes_l       = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto t_boxes_p       = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
     auto max_out_l       = mm->add_literal(int64_t{4});
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
     auto transpose_boxes = mm->add_instruction(
-        migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l);
+        migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_p);
     auto nms =
         mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                             transpose_boxes,
-                            scores_l,
+                            scores_p,
                             max_out_l,
                             iou_threshold,
                             score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {
+        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
+    };
+    std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
@@ -242,31 +277,43 @@ TEST_CASE(nms_multi_batch_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}};
-    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0,
-                                    0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
-
     migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}};
-    std::vector<float> scores_vec = {
-        0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+    migraphx::shape scalar_s{migraphx::shape::float_type, {1}};
+    migraphx::shape int_scalar_s{migraphx::shape::int64_type, {1}};
 
-    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
-    auto max_out_l       = mm->add_literal(int64_t{4});
-    auto iou_threshold   = mm->add_literal(0.5f);
-    auto score_threshold = mm->add_literal(0.0f);
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_p       = mm->add_parameter("max_out", int_scalar_s);
+    auto iou_threshold   = mm->add_parameter("iou_threshold", scalar_s);
+    auto score_threshold = mm->add_parameter("score_threshold", scalar_s);
 
     auto nms =
         mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                            boxes_l,
-                            scores_l,
-                            max_out_l,
+                            boxes_p,
+                            scores_p,
+                            max_out_p,
                             iou_threshold,
                             score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0,
+                                    0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+    std::vector<float> scores_vec = {
+        0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+    int64_t max_out_val = 4;
+    float iou_val       = 0.5f;
+    float score_val     = 0.0f;
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]           = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"]          = migraphx::argument(scores_s, scores_vec.data());
+    host_params["max_out"]         = migraphx::argument(int_scalar_s, &max_out_val);
+    host_params["iou_threshold"]   = migraphx::argument(scalar_s, &iou_val);
+    host_params["score_threshold"] = migraphx::argument(scalar_s, &score_val);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 1, 0, 3, 1, 0, 0, 1, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
@@ -278,34 +325,556 @@ TEST_CASE(nms_multi_class_test)
     migraphx::program p;
     auto* mm = p.get_main_module();
     migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-    std::vector<float> boxes_vec = {0.0, 0.0,  1.0, 1.0,  0.0, 0.1,   1.0, 1.1,
-                                    0.0, -0.1, 1.0, 0.9,  0.0, 10.0,  1.0, 11.0,
-                                    0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
-
     migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}};
-    std::vector<float> scores_vec = {
-        0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+    migraphx::shape scalar_s{migraphx::shape::float_type, {1}};
 
-    auto boxes_l         = mm->add_literal(migraphx::literal(boxes_s, boxes_vec));
-    auto scores_l        = mm->add_literal(migraphx::literal(scores_s, scores_vec));
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
     auto max_out_l       = mm->add_literal(int64_t{2});
     auto iou_threshold   = mm->add_literal(0.5f);
-    auto score_threshold = mm->add_literal(0.0f);
+    auto score_threshold = mm->add_parameter("score_threshold", scalar_s);
 
     auto nms =
         mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                            boxes_l,
-                            scores_l,
+                            boxes_p,
+                            scores_p,
                             max_out_l,
                             iou_threshold,
                             score_threshold);
     add_nms_return(mm, nms);
 
-    auto [indices, num_selected] = run_gpu_nms(std::move(p));
+    std::vector<float> boxes_vec = {0.0, 0.0,  1.0, 1.0,  0.0, 0.1,   1.0, 1.1,
+                                    0.0, -0.1, 1.0, 0.9,  0.0, 10.0,  1.0, 11.0,
+                                    0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
+    std::vector<float> scores_vec = {
+        0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+    float score_val = 0.0f;
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]           = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"]          = migraphx::argument(scores_s, scores_vec.data());
+    host_params["score_threshold"] = migraphx::argument(scalar_s, &score_val);
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
     std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 1, 3, 0, 1, 0};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
     EXPECT(num_selected == 4);
 }
 
+TEST_CASE(nms_20boxes_test) // 1 batch x 1 class x 20 boxes; default op attributes
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 20, 4}};
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 20}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{10}); // gold below has exactly 10 rows
+    auto iou_threshold   = mm->add_literal(0.5000f);
+    auto score_threshold = mm->add_literal(0.0000f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    std::vector<float> boxes_vec = {
+        32.7256f, 35.1377f, 43.0832f, 42.2579f, 13.9286f, 15.6152f, 21.5240f, 28.2727f, 44.0782f, 37.5280f, 52.9916f, 48.3318f,
+        38.8011f, 32.1818f, 50.5110f, 37.5550f, 33.9761f, -1.6170f, 43.8622f, 11.0347f, 5.3569f, 42.6478f, 14.1070f, 54.9145f,
+        18.9216f, 34.8446f, 27.7505f, 41.2693f, -0.4375f, 36.7849f, 4.8178f, 41.8215f, 6.9987f, 1.1282f, 8.4302f, 11.6832f,
+        30.5954f, 21.0410f, 37.7095f, 23.9976f, 35.2360f, 16.6405f, 39.2402f, 20.4393f, 45.0158f, 45.7867f, 51.7352f, 46.8898f,
+        9.8174f, 26.1848f, 22.7651f, 38.2017f, 16.3854f, 35.9841f, 20.6606f, 46.2920f, 22.5697f, 16.7346f, 24.3859f, 27.6069f,
+        7.0039f, 5.3968f, 11.9433f, 17.3270f, 3.9409f, 24.0168f, 9.0512f, 31.4417f, 18.6518f, -1.2903f, 28.9187f, 7.6721f,
+        6.9462f, 39.9030f, 15.7447f, 42.8601f, 27.5034f, 30.2815f, 39.4780f, 32.8849f};
+    std::vector<float> scores_vec = {
+        0.6979f, 0.4657f, 0.8326f, 0.2503f, 0.1204f, 0.1810f, 0.7501f, 0.5157f, 0.2451f, 0.5509f, 0.2371f, 0.7267f,
+        0.5015f, 0.4429f, 0.3714f, 0.6673f, 0.4256f, 0.1789f, 0.2062f, 0.9657f};
+
+    migraphx::parameter_map host_params; // keyed by the add_parameter names above
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
+    indices.resize(static_cast<std::size_t>(num_selected) * 3); // 3 entries per selected box
+    std::vector<int64_t> gold = {0, 0, 19, 0, 0, 2, 0, 0, 6, 0, 0, 11, 0, 0, 0, 0, 0, 15, 0, 0, 9, 0, 0, 7, 0, 0, 12, 0, 0, 1};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 10);
+}
+
+TEST_CASE(nms_50boxes_center_test) // 1 batch x 1 class x 50 boxes; center_point_box = true
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 50, 4}};
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 50}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{20}); // gold below has exactly 20 rows
+    auto iou_threshold   = mm->add_literal(0.4000f);
+    auto score_threshold = mm->add_literal(0.2000f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    std::vector<float> boxes_vec = {
+        90.8581f, 82.6292f, 23.5447f, 19.9060f, 69.9707f, 89.6161f, 29.1830f, 26.1572f, 26.5870f, 14.0249f, 15.5215f, 14.1630f,
+        96.9176f, 55.4036f, 5.1730f, 8.1873f, 77.8751f, 10.8576f, 1.4042f, 7.8632f, 71.6890f, 67.2240f, 7.6600f, 22.6344f,
+        44.9361f, 28.1234f, 4.8228f, 24.6805f, 27.2242f, 65.9423f, 20.6521f, 4.0531f, 9.6391f, 72.6995f, 4.5331f, 2.9302f,
+        90.2602f, 76.8647f, 15.6836f, 18.2386f, 45.5776f, 10.7741f, 21.1336f, 5.2390f, 20.2363f, 91.6012f, 17.8524f, 24.9153f,
+        30.5957f, 23.0214f, 6.7935f, 9.9997f, 57.9220f, 3.7413f, 24.3196f, 5.1723f, 17.6773f, 55.4852f, 21.7468f, 27.7081f,
+        85.6614f, 37.0922f, 22.4305f, 5.8004f, 75.8520f, 82.9790f, 4.8007f, 9.2569f, 71.9463f, 80.8251f, 4.5889f, 5.4548f,
+        43.2093f, 31.7139f, 27.8993f, 4.3492f, 62.7309f, 95.2899f, 12.5298f, 1.6133f, 58.4098f, 29.0918f, 9.7275f, 2.6065f,
+        64.9847f, 51.5057f, 15.1689f, 6.0646f, 8.4444f, 25.5965f, 20.2231f, 2.5481f, 41.5807f, 93.6044f, 28.7131f, 18.1432f,
+        4.1614f, 16.4608f, 9.3069f, 20.7407f, 49.3991f, 4.4911f, 27.8194f, 12.4153f, 32.9861f, 43.5097f, 1.7209f, 10.2217f,
+        14.4524f, 99.2376f, 17.1007f, 15.6313f, 10.3403f, 89.1677f, 19.3853f, 26.3751f, 58.7645f, 74.8608f, 4.0710f, 25.6828f,
+        17.0593f, 89.0792f, 5.0698f, 2.2608f, 92.5120f, 89.3447f, 13.1543f, 6.2635f, 58.1061f, 51.8858f, 29.0207f, 7.8656f,
+        34.6870f, 31.5929f, 18.2852f, 8.2322f, 59.0915f, 77.2012f, 28.0577f, 17.5657f, 2.2804f, 66.1661f, 24.3265f, 13.0716f,
+        95.8559f, 37.3658f, 14.5541f, 2.4284f, 48.2303f, 9.4467f, 23.7581f, 11.8348f, 78.2735f, 74.6790f, 1.5173f, 16.1888f,
+        8.2730f, 26.2461f, 4.1652f, 3.9485f, 48.6658f, 93.6813f, 25.0534f, 25.1703f, 49.0707f, 24.0971f, 24.1077f, 2.5069f,
+        93.7826f, 12.2758f, 7.7466f, 27.8204f, 57.1728f, 83.1113f, 16.3923f, 3.8743f, 47.3489f, 15.3284f, 18.5745f, 25.4637f,
+        26.6976f, 17.9268f, 26.1644f, 27.1769f, 33.1569f, 59.9383f, 18.4901f, 29.4075f, 52.0672f, 87.4562f, 12.9646f, 24.2588f,
+        43.8911f, 19.6435f, 11.8513f, 23.6048f, 2.1612f, 31.0324f, 13.3506f, 19.6320f};
+    std::vector<float> scores_vec = {
+        0.8011f, 0.2211f, 0.5825f, 0.5628f, 0.8718f, 0.5165f, 0.4466f, 0.6756f, 0.3398f, 0.2258f, 0.5301f, 0.4752f,
+        0.3093f, 0.4308f, 0.4298f, 0.3947f, 0.4415f, 0.7172f, 0.3672f, 0.9540f, 0.9247f, 0.5328f, 0.3955f, 0.5819f,
+        0.8637f, 0.6873f, 0.8240f, 0.5795f, 0.6696f, 0.3593f, 0.7614f, 0.2822f, 0.7253f, 0.8746f, 0.2189f, 0.6529f,
+        0.1856f, 0.7531f, 0.1760f, 0.9423f, 0.2237f, 0.9630f, 0.8208f, 0.6343f, 0.8044f, 0.8156f, 0.9514f, 0.3280f,
+        0.6311f, 0.1855f};
+
+    migraphx::parameter_map host_params; // keyed by the add_parameter names above
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
+    indices.resize(static_cast<std::size_t>(num_selected) * 3); // 3 entries per selected box
+    std::vector<int64_t> gold = {0, 0, 41, 0, 0, 19, 0, 0, 46, 0, 0, 39, 0, 0, 20, 0, 0, 33, 0, 0, 4, 0, 0, 24, 0, 0, 26, 0, 0, 42, 0, 0, 45, 0, 0, 44, 0, 0, 0, 0, 0, 30, 0, 0, 32, 0, 0, 17, 0, 0, 25, 0, 0, 7, 0, 0, 28, 0, 0, 35};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 20);
+}
+
+TEST_CASE(nms_100boxes_2batch_test) // 2 batches x 1 class x 100 boxes; default op attributes
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {2, 100, 4}};
+    migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 100}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{15}); // gold has 15 rows for each of the 2 batches
+    auto iou_threshold   = mm->add_literal(0.5000f);
+    auto score_threshold = mm->add_literal(0.1000f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    std::vector<float> boxes_vec = {
+        -3.8699f, 108.8880f, 20.8101f, 137.5783f, 149.9079f, 29.3134f, 203.7504f, 39.2031f, 121.6031f, 107.1528f, 162.2282f, 118.8275f,
+        27.1146f, 87.2265f, 42.1365f, 141.7457f, -7.3128f, 91.3799f, 44.0012f, 95.0142f, 25.9397f, 97.1572f, 47.4736f, 111.8955f,
+        170.3318f, 143.6689f, 221.6791f, 161.9004f, 82.3933f, 144.8881f, 101.0310f, 174.8098f, 138.9017f, 80.6305f, 174.7306f, 116.2308f,
+        115.0719f, 104.8666f, 139.4914f, 134.9707f, 105.8753f, 183.2658f, 123.0900f, 189.2287f, 2.3726f, 16.2585f, 55.6795f, 31.6349f,
+        183.1709f, -1.9651f, 195.2389f, 48.8066f, 57.2666f, -1.7671f, 63.2705f, 36.8507f, 105.0166f, 111.9228f, 126.1903f, 151.2225f,
+        118.2848f, 63.4507f, 161.6255f, 103.9927f, 105.5274f, 131.8586f, 154.1659f, 177.8699f, 158.1560f, 132.0321f, 218.0818f, 136.4605f,
+        20.4451f, 55.4126f, 38.9305f, 78.0425f, 89.1363f, 163.2572f, 114.2048f, 196.0894f, 76.2707f, 142.0220f, 85.3431f, 162.9909f,
+        77.3750f, 28.6949f, 112.2925f, 79.5191f, -6.0851f, 58.1025f, 53.7721f, 87.5743f, 5.6429f, 39.7135f, 47.9949f, 86.0625f,
+        37.5563f, 5.8879f, 73.6739f, 57.1568f, 48.8660f, 14.1653f, 73.0158f, 44.9480f, 58.0793f, 159.8937f, 113.0820f, 214.5573f,
+        107.0385f, 69.7607f, 137.3566f, 105.4010f, 122.4620f, 51.0809f, 131.3896f, 102.2471f, 71.0835f, 135.3897f, 93.6408f, 156.4846f,
+        79.2752f, 95.3835f, 84.2380f, 125.8137f, 37.0673f, 171.0514f, 49.9841f, 203.4046f, 116.6400f, 152.4634f, 118.6825f, 159.6572f,
+        49.5364f, 83.6166f, 77.2799f, 108.1312f, -12.0070f, 47.7104f, 26.4309f, 102.8334f, 73.0529f, 178.2168f, 94.3071f, 216.4359f,
+        81.9253f, 137.8156f, 107.7278f, 149.2885f, 16.3219f, 179.7427f, 73.9152f, 200.7352f, 91.8087f, 17.5434f, 137.1745f, 29.8480f,
+        96.6991f, 168.8745f, 129.6096f, 171.3390f, 131.5065f, 99.5547f, 149.2944f, 155.2749f, 102.6283f, 10.6622f, 156.5511f, 38.1065f,
+        123.0512f, 108.0793f, 137.9220f, 127.2239f, 53.1452f, 119.0642f, 73.3404f, 155.3743f, 130.1690f, 1.7448f, 184.8039f, 3.1763f,
+        93.7074f, 82.1619f, 125.9504f, 99.5652f, 63.8853f, 143.8404f, 108.6820f, 186.3194f, 107.2755f, 39.8756f, 143.1295f, 78.2680f,
+        52.3550f, 62.2463f, 91.9079f, 121.1729f, 93.2160f, 69.6623f, 111.8797f, 107.2634f, 139.7207f, 45.7991f, 154.9616f, 74.9719f,
+        167.2671f, 160.7261f, 187.2941f, 206.6506f, 179.1259f, 129.1106f, 189.2970f, 183.4070f, 74.4343f, 0.3572f, 127.0189f, 43.8782f,
+        95.1992f, 170.4922f, 112.9108f, 228.3217f, 142.9101f, 152.2709f, 177.0380f, 199.4092f, 39.0269f, 30.7110f, 86.7534f, 82.8523f,
+        143.8537f, 163.5132f, 191.0993f, 171.2454f, 85.3959f, -0.8223f, 112.2607f, 43.3901f, 8.6218f, 186.3383f, 37.7209f, 213.3036f,
+        -15.4319f, 116.3204f, 44.2555f, 149.9535f, 147.9980f, 110.2290f, 188.7993f, 149.8210f, -13.4183f, -11.0214f, 35.6454f, 47.1977f,
+        28.9969f, 149.8616f, 83.2476f, 208.9517f, 43.0921f, -3.2028f, 90.5599f, 14.8026f, 28.6361f, 26.0199f, 40.5617f, 70.3113f,
+        45.6946f, 5.9799f, 79.8627f, 51.2289f, 145.0326f, 144.6320f, 152.0444f, 166.0751f, -16.8246f, 35.4867f, 22.6978f, 43.7950f,
+        136.7519f, 180.4197f, 194.1175f, 183.8356f, 155.6840f, 107.8222f, 186.9352f, 154.6854f, 61.1796f, -7.7136f, 87.7250f, 22.1787f,
+        29.1652f, -28.4875f, 32.2799f, 30.6594f, 91.3547f, -3.8851f, 148.9814f, 24.5483f, 20.3959f, 91.8365f, 27.4731f, 150.5336f,
+        71.2720f, 147.6549f, 74.6957f, 172.9379f, 183.9269f, 23.7969f, 199.4448f, 71.6242f, 196.6597f, 166.8796f, 201.5260f, 172.8839f,
+        140.4950f, -5.4397f, 168.3470f, 28.3325f, 46.4677f, 136.0320f, 77.9169f, 184.3535f, 127.8122f, 157.7804f, 147.2538f, 213.3378f,
+        139.0779f, 129.6555f, 143.0846f, 179.1879f, 73.7761f, 138.0335f, 81.3605f, 141.2148f, 116.3348f, 156.1013f, 140.0206f, 179.0908f,
+        -0.1401f, 6.0937f, 4.4311f, 9.9669f, 20.7149f, 36.6326f, 62.9081f, 44.0802f, 98.4106f, 4.5632f, 111.6248f, 45.4062f,
+        23.3391f, 79.3651f, 42.1614f, 122.4473f, 21.0547f, 125.7129f, 45.3081f, 172.3624f, 154.4709f, 99.9714f, 180.0508f, 152.0333f,
+        197.2776f, 147.9130f, 198.3756f, 192.5394f, 107.3878f, 6.9169f, 115.0000f, 55.1683f, 141.8624f, 144.9798f, 193.7655f, 148.8687f,
+        197.5280f, 31.1895f, 198.6007f, 46.0271f, 12.8282f, 35.3058f, 43.8101f, 72.9977f, 74.7088f, 116.1662f, 104.5894f, 167.7956f,
+        68.1883f, 195.4082f, 88.8408f, 196.6737f, 2.7857f, 106.6272f, 29.2340f, 137.9903f, 127.5389f, -9.5799f, 174.5932f, 31.3800f,
+        61.4403f, 121.8884f, 112.0713f, 124.6352f, 15.4868f, 35.9096f, 55.8899f, 68.2298f, 35.5922f, 56.6701f, 44.2246f, 72.3261f,
+        163.1796f, 40.7751f, 180.4136f, 56.2181f, 177.9262f, 90.7157f, 187.1069f, 101.2297f, 33.5656f, 108.4211f, 51.2933f, 164.8822f,
+        73.5555f, 18.9549f, 114.3649f, 72.3462f, 119.3443f, 42.7151f, 174.0536f, 89.5792f, 169.1987f, 170.3059f, 182.1476f, 201.8479f,
+        59.3192f, -5.2591f, 92.3019f, 24.6868f, 82.2129f, 76.0264f, 124.5949f, 108.2814f, 119.7321f, 125.9828f, 176.9545f, 158.6404f,
+        127.7304f, 16.7712f, 164.7240f, 43.4104f, 148.5664f, 5.0880f, 164.6177f, 13.8616f, 95.0352f, 23.4340f, 132.9384f, 31.8482f,
+        10.9685f, 155.1733f, 30.8775f, 212.3560f, 151.4989f, -12.8680f, 210.0904f, 16.5719f, 160.8241f, 9.0448f, 185.4050f, 66.2840f,
+        138.8994f, 0.9312f, 180.3396f, 11.5822f, 18.7873f, 5.2706f, 21.1577f, 38.9812f, 28.5777f, 117.4022f, 53.1813f, 130.6575f,
+        122.4044f, 40.3588f, 175.0358f, 56.2967f, -13.8737f, 112.4558f, 23.1297f, 115.2290f, 182.2486f, 114.0300f, 209.4412f, 122.0482f,
+        47.3188f, 142.3400f, 103.5391f, 197.4341f, 118.1700f, -9.0369f, 169.5550f, 10.9335f, 167.5089f, 152.2341f, 187.5196f, 189.1137f,
+        62.3618f, 109.6059f, 95.4902f, 138.0417f, 48.8767f, 20.2354f, 78.7763f, 44.8620f, 102.5983f, 138.3968f, 140.8982f, 170.7781f,
+        105.8416f, 165.0748f, 126.5542f, 177.1219f, 74.1239f, 21.1889f, 89.5320f, 80.5165f, 92.9311f, 159.1187f, 147.7788f, 208.3988f,
+        159.3220f, 68.5139f, 214.8306f, 113.2691f, 68.1500f, 106.3565f, 118.9061f, 135.0133f, 9.9914f, 191.9200f, 68.7055f, 201.9398f,
+        52.9639f, 44.6476f, 97.9184f, 99.9669f, 55.7637f, 152.0609f, 101.8791f, 173.2028f, 3.2253f, 61.7017f, 49.2181f, 65.6580f,
+        17.8964f, 149.2418f, 47.2522f, 170.4436f, 122.9471f, 96.2103f, 150.8778f, 144.0833f, 60.3089f, 24.4012f, 75.4822f, 62.1410f,
+        171.4575f, 60.1555f, 210.5018f, 105.4550f, 39.6844f, 39.6149f, 57.7543f, 87.4394f, 11.6796f, 8.8690f, 27.8902f, 22.3743f,
+        132.9151f, -21.7847f, 168.4868f, 33.7186f, 163.6127f, 55.8750f, 188.8017f, 82.7164f, 48.6664f, -15.5441f, 62.5789f, 23.1577f,
+        15.8440f, 32.5294f, 64.9913f, 33.6657f, 11.2664f, 115.2323f, 63.0400f, 174.8410f, 98.9553f, 132.8318f, 109.8496f, 150.4047f,
+        92.9619f, 145.3852f, 94.4048f, 150.0469f, 41.4721f, 49.4119f, 62.3038f, 77.4494f, -14.9919f, 173.6975f, 33.0612f, 182.3103f,
+        71.0426f, 113.7725f, 121.5539f, 123.7598f, 187.2858f, 6.0529f, 196.4472f, 44.3576f, 107.1609f, 16.6524f, 153.8468f, 40.8351f,
+        95.1880f, 110.9244f, 103.0146f, 166.3137f, 10.1316f, 24.6737f, 34.1453f, 44.5039f, 20.5283f, 79.5362f, 80.4462f, 123.3809f,
+        52.7734f, 184.2525f, 65.1362f, 212.4573f, 147.9188f, -19.1670f, 158.0026f, 20.7701f, 162.3696f, -14.8751f, 188.3148f, 21.5070f,
+        161.5482f, 184.1698f, 199.1086f, 213.0640f, 168.8931f, 88.4010f, 224.9343f, 145.4546f, 167.0391f, 14.7719f, 225.9076f, 35.9920f,
+        188.0454f, 173.7320f, 193.1542f, 185.1889f, 9.7935f, 155.5723f, 18.9354f, 196.5798f, 3.7319f, 81.7829f, 51.3855f, 132.6973f,
+        52.4097f, 122.6709f, 69.3770f, 126.0459f, 83.9766f, 40.8733f, 137.1827f, 68.4016f, -0.6763f, -16.7244f, 39.4674f, 36.9323f,
+        165.3600f, 96.2998f, 172.9588f, 141.5273f, 98.2916f, 29.1927f, 148.4108f, 88.7094f, 102.7704f, 116.5475f, 114.1754f, 148.9009f,
+        20.0692f, 147.2792f, 46.0554f, 187.2189f, 33.8616f, -5.7911f, 67.4406f, 13.0553f, 16.7898f, 90.6905f, 47.3350f, 147.5951f,
+        149.6448f, 34.9492f, 191.1284f, 57.5630f, 97.0913f, 152.4916f, 136.5998f, 197.0638f, 117.2606f, 38.3403f, 176.7911f, 63.1255f,
+        29.2236f, 105.0804f, 89.1895f, 139.2277f, 58.5150f, 88.9746f, 89.9861f, 132.4418f, 77.6626f, 63.7197f, 84.2794f, 94.7469f,
+        130.0316f, 108.2651f, 173.9744f, 162.7832f, 125.1590f, 132.2845f, 183.7822f, 158.0233f, 31.4721f, 93.7989f, 51.2533f, 132.9762f,
+        174.2021f, 141.0848f, 202.4134f, 162.2841f, 11.1001f, 184.1428f, 37.1620f, 209.2240f, 177.2076f, 70.3730f, 181.2413f, 97.3360f,
+        -0.2527f, 98.7053f, 40.4109f, 107.1279f, 41.9845f, -0.7119f, 63.8314f, 5.6998f, 145.5655f, 139.0148f, 193.0259f, 179.3967f,
+        10.8509f, 84.2082f, 60.9460f, 123.8838f, 57.9873f, 61.5364f, 107.4399f, 101.6481f, 77.1802f, 17.7313f, 102.7635f, 19.8975f,
+        39.0662f, 167.7982f, 59.0374f, 188.0644f, 119.4588f, 72.6661f, 164.6393f, 85.3368f, 146.1259f, 113.0609f, 194.4079f, 159.9718f,
+        159.9229f, 3.9862f, 189.9071f, 55.7634f, 41.0200f, 184.5329f, 94.7088f, 200.0870f};
+    std::vector<float> scores_vec = {
+        0.1439f, 0.8791f, 0.0961f, 0.1535f, 0.5338f, 0.0675f, 0.0528f, 0.0005f, 0.4363f, 0.7746f, 0.0348f, 0.6523f,
+        0.8231f, 0.1680f, 0.1469f, 0.8608f, 0.8231f, 0.5389f, 0.8192f, 0.0928f, 0.3945f, 0.7378f, 0.2575f, 0.7523f,
+        0.5042f, 0.7503f, 0.4647f, 0.3679f, 0.2192f, 0.2084f, 0.7515f, 0.1189f, 0.0860f, 0.1763f, 0.1753f, 0.8231f,
+        0.3985f, 0.9904f, 0.1372f, 0.6535f, 0.4487f, 0.3929f, 0.8751f, 0.9756f, 0.8729f, 0.1923f, 0.2208f, 0.6561f,
+        0.2891f, 0.7347f, 0.5664f, 0.5509f, 0.8285f, 0.7105f, 0.0266f, 0.0495f, 0.6016f, 0.4862f, 0.2602f, 0.4187f,
+        0.7579f, 0.8266f, 0.5612f, 0.3854f, 0.2707f, 0.5219f, 0.3147f, 0.5641f, 0.6767f, 0.0661f, 0.0011f, 0.2123f,
+        0.8945f, 0.6463f, 0.1720f, 0.8903f, 0.4700f, 0.4761f, 0.9355f, 0.0595f, 0.2152f, 0.5858f, 0.1955f, 0.6795f,
+        0.2141f, 0.0992f, 0.2070f, 0.4227f, 0.1761f, 0.1347f, 0.8603f, 0.3204f, 0.3608f, 0.0553f, 0.3574f, 0.2648f,
+        0.6105f, 0.2054f, 0.8884f, 0.9297f, 0.0998f, 0.1074f, 0.1153f, 0.6196f, 0.1220f, 0.8524f, 0.7543f, 0.8198f,
+        0.5261f, 0.9967f, 0.0442f, 0.4013f, 0.3239f, 0.9486f, 0.5769f, 0.8062f, 0.1703f, 0.9786f, 0.4986f, 0.4937f,
+        0.9709f, 0.3807f, 0.3975f, 0.5848f, 0.1281f, 0.3211f, 0.1932f, 0.1033f, 0.8661f, 0.5893f, 0.3587f, 0.4087f,
+        0.4315f, 0.6331f, 0.9268f, 0.9328f, 0.3915f, 0.3293f, 0.4510f, 0.5679f, 0.4618f, 0.6588f, 0.5544f, 0.3207f,
+        0.3457f, 0.3786f, 0.0946f, 0.1661f, 0.7231f, 0.3891f, 0.2145f, 0.5627f, 0.7555f, 0.2574f, 0.8268f, 0.9275f,
+        0.5974f, 0.6689f, 0.0526f, 0.9455f, 0.3925f, 0.9239f, 0.5790f, 0.0046f, 0.0385f, 0.6804f, 0.5627f, 0.0265f,
+        0.7435f, 0.8521f, 0.4964f, 0.4658f, 0.0055f, 0.7866f, 0.3307f, 0.8788f, 0.3731f, 0.5651f, 0.2703f, 0.1606f,
+        0.7749f, 0.4966f, 0.5365f, 0.9654f, 0.9636f, 0.8556f, 0.1876f, 0.5943f, 0.8781f, 0.3745f, 0.1011f, 0.8110f,
+        0.4818f, 0.5644f, 0.9821f, 0.6072f, 0.4250f, 0.3700f, 0.4176f, 0.1184f};
+
+    migraphx::parameter_map host_params; // keyed by the add_parameter names above
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
+    indices.resize(static_cast<std::size_t>(num_selected) * 3); // 3 entries per selected box
+    std::vector<int64_t> gold = {0, 0, 37, 0, 0, 43, 0, 0, 78, 0, 0, 99, 0, 0, 72, 0, 0, 75, 0, 0, 98, 0, 0, 1, 0, 0, 42, 0, 0, 44, 0, 0, 15, 0, 0, 90, 0, 0, 52, 0, 0, 61, 0, 0, 12, 1, 0, 9, 1, 0, 94, 1, 0, 17, 1, 0, 20, 1, 0, 83, 1, 0, 84, 1, 0, 13, 1, 0, 59, 1, 0, 35, 1, 0, 55, 1, 0, 34, 1, 0, 61, 1, 0, 75, 1, 0, 88, 1, 0, 28};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 30);
+}
+
+TEST_CASE(nms_30boxes_3class_test) // 1 batch x 3 classes x 30 boxes; default op attributes
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 30, 4}};
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 3, 30}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{5}); // gold has 5 rows for each of the 3 classes
+    auto iou_threshold   = mm->add_literal(0.4500f);
+    auto score_threshold = mm->add_literal(0.1500f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    std::vector<float> boxes_vec = {
+        31.2680f, 53.5348f, 37.7043f, 73.6253f, 1.8071f, 55.2945f, 3.9368f, 78.7402f, 40.5016f, 12.5670f, 45.0345f, 32.9366f,
+        78.2552f, 12.9548f, 80.7117f, 35.6526f, 73.9527f, 67.9870f, 79.4405f, 71.9065f, -3.8066f, -7.7339f, 10.2705f, 11.5692f,
+        45.4706f, 34.8613f, 67.4569f, 48.4119f, 17.4632f, 30.3439f, 30.8192f, 43.8443f, 64.5403f, 44.3725f, 79.9380f, 66.0477f,
+        0.7877f, 1.3956f, 6.4307f, 24.7471f, 65.1632f, 44.8608f, 84.5766f, 62.0721f, 59.3935f, 24.0849f, 74.6026f, 36.1925f,
+        -1.0372f, 43.7485f, 19.8379f, 55.2458f, -6.6257f, -1.7353f, 16.1976f, 8.1505f, 62.2758f, 32.2798f, 71.2775f, 41.5966f,
+        10.9190f, 36.7777f, 14.0023f, 46.7824f, 39.6937f, 15.6139f, 45.8900f, 18.6783f, 67.7244f, 9.7794f, 78.7948f, 12.5604f,
+        34.0204f, 5.6094f, 56.7713f, 24.5464f, 26.9281f, 21.9014f, 36.6292f, 33.1611f, 26.2374f, -3.4581f, 44.9652f, 18.9477f,
+        -1.6661f, 68.2450f, 11.7649f, 83.3261f, 74.8979f, 31.4950f, 80.1025f, 33.3041f, 20.6639f, 62.4061f, 29.0408f, 67.0291f,
+        7.1374f, 75.0864f, 23.1608f, 80.8203f, 14.6460f, -5.2621f, 31.1216f, 18.1798f, 71.6501f, 49.1185f, 82.6496f, 55.1487f,
+        4.4135f, 63.2815f, 10.6723f, 76.1439f, 60.5823f, 39.4727f, 78.1862f, 62.0048f, 54.1855f, 22.5844f, 59.0696f, 46.0598f};
+    std::vector<float> scores_vec = {
+        0.9367f, 0.1879f, 0.1073f, 0.4976f, 0.5195f, 0.5082f, 0.4367f, 0.9948f, 0.4863f, 0.4779f, 0.4218f, 0.0668f,
+        0.5930f, 0.2280f, 0.6376f, 0.0508f, 0.9814f, 0.4690f, 0.8968f, 0.4756f, 0.0603f, 0.8222f, 0.6482f, 0.7818f,
+        0.4282f, 0.6379f, 0.8562f, 0.6311f, 0.3477f, 0.6625f, 0.6719f, 0.9606f, 0.3709f, 0.4251f, 0.8121f, 0.5058f,
+        0.7366f, 0.4597f, 0.2155f, 0.7452f, 0.1312f, 0.1986f, 0.6268f, 0.7473f, 0.8947f, 0.2726f, 0.1107f, 0.9560f,
+        0.1544f, 0.1977f, 0.2913f, 0.5294f, 0.8828f, 0.7605f, 0.7082f, 0.1752f, 0.3577f, 0.4784f, 0.1474f, 0.2734f,
+        0.3083f, 0.1273f, 0.5502f, 0.7050f, 0.0699f, 0.4811f, 0.7822f, 0.7480f, 0.8151f, 0.4482f, 0.8206f, 0.2408f,
+        0.3608f, 0.1764f, 0.4675f, 0.3921f, 0.2409f, 0.7518f, 0.3138f, 0.2728f, 0.1309f, 0.4388f, 0.3030f, 0.3693f,
+        0.2360f, 0.7632f, 0.9300f, 0.4979f, 0.6430f, 0.8672f};
+
+    migraphx::parameter_map host_params; // keyed by the add_parameter names above
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
+    indices.resize(static_cast<std::size_t>(num_selected) * 3); // 3 entries per selected box
+    std::vector<int64_t> gold = {0, 0, 7, 0, 0, 16, 0, 0, 0, 0, 0, 18, 0, 0, 26, 0, 1, 1, 0, 1, 17, 0, 1, 14, 0, 1, 22, 0, 1, 4, 0, 2, 26, 0, 2, 29, 0, 2, 10, 0, 2, 6, 0, 2, 25};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 15);
+}
+
+TEST_CASE(nms_200boxes_2batch_2class_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_s{migraphx::shape::float_type, {2, 200, 4}};
+    migraphx::shape scores_s{migraphx::shape::float_type, {2, 2, 200}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_s);
+    auto scores_p        = mm->add_parameter("scores", scores_s);
+    auto max_out_l       = mm->add_literal(int64_t{25});
+    auto iou_threshold   = mm->add_literal(0.3000f);
+    auto score_threshold = mm->add_literal(0.2500f);
+
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
+    add_nms_return(mm, nms);
+
+    std::vector<float> boxes_vec = {
+        132.1894f, 453.1169f, 199.9736f, 545.7127f, 64.3090f, 275.1729f, 104.8258f, 338.3436f, 76.1273f, 401.7875f, 135.6448f, 487.9920f,
+        12.8305f, 442.3624f, 77.1708f, 466.2458f, -5.9609f, 340.1129f, 126.0715f, 451.3386f, 15.0119f, 224.3769f, 56.2927f, 236.5545f,
+        427.8277f, -14.2917f, 561.9954f, 95.4457f, 4.7940f, -55.8461f, 69.2637f, 71.6517f, 41.3494f, 202.9014f, 91.1927f, 274.2992f,
+        375.6902f, 208.6749f, 451.5645f, 285.6396f, 258.4982f, 179.9212f, 321.7420f, 227.4412f, 367.5344f, 211.3590f, 406.8828f, 356.8083f,
+        277.5064f, 220.9636f, 353.4056f, 331.1991f, 429.2783f, 390.3169f, 452.8968f, 446.2962f, 292.5150f, 40.8054f, 345.9525f, 67.8517f,
+        218.4112f, 95.7302f, 303.7139f, 129.4475f, 325.0759f, 361.4403f, 387.6738f, 431.5647f, 161.8149f, 353.1971f, 285.5779f, 494.6398f,
+        153.4061f, 442.2182f, 192.6577f, 552.6060f, 161.0782f, 419.9203f, 306.5742f, 452.9917f, 25.8953f, 380.4122f, 143.8188f, 509.4868f,
+        325.7002f, 128.4980f, 470.8716f, 185.8499f, 67.4107f, 136.8775f, 193.2931f, 264.7841f, 65.6790f, 115.5359f, 87.8525f, 152.5492f,
+        83.4548f, 256.5595f, 162.8974f, 349.7399f, 407.8717f, 399.8657f, 434.1985f, 538.9396f, 103.6427f, 152.6073f, 226.5586f, 192.0336f,
+        299.0049f, 226.3779f, 387.0450f, 330.6239f, 408.0779f, 74.0950f, 448.3318f, 222.2046f, -30.8828f, 73.1804f, 108.6275f, 96.6196f,
+        373.4308f, 90.5068f, 391.5936f, 104.6787f, 111.3250f, -21.7549f, 196.3405f, 79.7002f, 54.0937f, 448.8364f, 162.5287f, 500.4571f,
+        339.5665f, 195.6321f, 349.3349f, 207.2475f, 409.8580f, 381.1502f, 499.9386f, 452.9707f, 86.2250f, 284.0088f, 208.7943f, 397.3206f,
+        278.8861f, 74.2190f, 289.9477f, 117.7022f, 106.2550f, 62.2701f, 183.5792f, 113.1921f, 257.3803f, 342.4895f, 296.9053f, 469.4987f,
+        261.0432f, 93.1105f, 360.8189f, 171.6012f, 295.8262f, 393.3591f, 314.5092f, 519.9261f, 241.4629f, 36.2717f, 382.0835f, 103.7837f,
+        0.3826f, 267.3577f, 134.6972f, 410.3510f, 332.4151f, 358.2527f, 361.1253f, 456.2211f, 312.7919f, 108.4937f, 361.9585f, 126.7627f,
+        297.0153f, 71.6643f, 385.5729f, 204.5431f, -16.9604f, 445.3092f, 91.0309f, 519.2097f, 189.9415f, 121.2467f, 256.8973f, 143.3509f,
+        192.3739f, 203.1031f, 216.6613f, 226.8539f, 35.0965f, 164.5365f, 51.6150f, 267.9791f, 36.2014f, 122.4881f, 186.1665f, 130.5466f,
+        186.0576f, 366.0443f, 254.9050f, 409.7468f, 305.9496f, 375.0105f, 436.9568f, 396.8388f, 82.0940f, 155.7987f, 154.9680f, 222.5193f,
+        345.6593f, 386.1935f, 484.0906f, 448.9323f, 265.8611f, 67.1577f, 279.9372f, 145.9173f, 371.2164f, -19.1800f, 389.2053f, 23.4858f,
+        166.5204f, 282.6964f, 306.0356f, 288.4709f, 178.5089f, 450.7671f, 320.6853f, 543.3107f, 285.9132f, -9.0198f, 333.8062f, 47.6641f,
+        437.0255f, 54.9746f, 490.9451f, 153.0235f, 211.6987f, 250.8616f, 280.1138f, 268.0530f, 232.8247f, 403.4440f, 295.8328f, 406.4968f,
+        286.3401f, 25.5231f, 315.6569f, 63.5189f, 301.3286f, 163.1046f, 436.1865f, 232.1301f, 16.5538f, 343.6795f, 55.2966f, 403.3963f,
+        204.8009f, 124.9041f, 310.8865f, 246.6391f, 235.2927f, 65.7693f, 246.2989f, 123.0671f, 457.4555f, 57.7300f, 464.2295f, 137.7658f,
+        197.5504f, 160.3075f, 295.9562f, 249.7413f, 208.4036f, 237.5821f, 259.9170f, 241.8350f, 431.7683f, 392.0298f, 530.4317f, 469.7846f,
+        217.7836f, 294.9363f, 232.7928f, 347.3161f, 19.1783f, 313.3156f, 161.7061f, 377.0863f, 52.1937f, 483.5222f, 164.7224f, 499.4650f,
+        -18.1881f, 147.1016f, 113.3757f, 264.7419f, -10.3830f, 130.9681f, 10.9511f, 272.3863f, 191.6208f, 459.5145f, 240.3248f, 463.8325f,
+        356.6797f, 77.6355f, 412.5629f, 168.2401f, 326.2139f, 307.5013f, 407.2526f, 422.3140f, -6.5422f, 355.5684f, 38.6912f, 399.0047f,
+        279.9745f, -10.2789f, 290.0085f, 108.0669f, 49.1601f, 186.5052f, 105.1230f, 281.7262f, 451.0742f, 30.5586f, 490.0021f, 170.0038f,
+        54.4314f, 19.1028f, 112.9336f, 166.2725f, 298.1461f, 228.2593f, 328.4931f, 235.5688f, 143.1079f, 111.0670f, 183.1305f, 178.3627f,
+        273.5727f, 356.7796f, 367.9886f, 439.2808f, 176.7118f, 442.3701f, 235.5468f, 465.2348f, 353.5905f, 375.8070f, 406.0526f, 426.9136f,
+        75.0636f, 58.9357f, 155.6155f, 207.0952f, 394.8923f, 135.3580f, 510.8995f, 138.7764f, 221.3792f, 93.1523f, 278.8305f, 161.5760f,
+        333.7764f, 4.2413f, 422.3168f, 130.7968f, 352.3830f, 447.2686f, 497.3472f, 496.5298f, 460.0268f, 164.7789f, 538.8018f, 237.2689f,
+        43.6929f, 38.9803f, 180.2527f, 185.7092f, 83.8176f, 387.4572f, 203.0748f, 459.2138f, 120.3420f, 189.3440f, 130.0911f, 209.8513f,
+        98.9678f, 13.2052f, 163.9035f, 21.9117f, 238.6976f, 10.0373f, 343.7471f, 151.9043f, 422.7512f, 299.3224f, 570.7713f, 339.9280f,
+        460.4900f, 353.3999f, 529.7881f, 429.5054f, 255.9741f, 98.2099f, 270.7991f, 112.7245f, 277.1439f, 426.6355f, 361.8833f, 490.7601f,
+        420.0563f, 355.7057f, 439.9143f, 495.2914f, 409.9785f, 386.2606f, 522.9550f, 462.1201f, 63.6084f, 40.9810f, 140.2522f, 186.6801f,
+        209.8752f, 5.4847f, 318.6665f, 45.0513f, 351.1511f, 395.6231f, 481.6860f, 471.8004f, 104.2444f, 88.3651f, 198.9577f, 217.4352f,
+        173.7778f, 275.5634f, 266.0312f, 343.3530f, 436.0951f, 358.6616f, 549.5261f, 401.3052f, 429.2604f, -0.0863f, 555.7863f, 128.3795f,
+        387.8089f, 360.8724f, 518.2979f, 419.9659f, 396.0101f, 429.2169f, 402.4382f, 509.2946f, 92.6291f, 290.9362f, 176.5014f, 437.4388f,
+        143.8130f, 206.2184f, 177.0371f, 235.0044f, 209.0457f, 415.3847f, 338.2372f, 461.2934f, 231.5831f, 260.9141f, 329.1943f, 266.5435f,
+        220.9448f, 342.6935f, 284.5580f, 402.0774f, 303.8214f, 394.8393f, 332.8489f, 425.6666f, 178.4043f, 323.5138f, 229.9188f, 425.8390f,
+        321.6556f, 129.9190f, 427.5185f, 157.9359f, 151.0502f, 8.1484f, 182.4998f, 109.6955f, 157.8666f, 99.0403f, 172.8104f, 139.2982f,
+        -3.0452f, 224.4737f, 130.2711f, 278.4012f, 36.9224f, 226.1483f, 151.7898f, 279.1286f, 409.8757f, 237.4242f, 440.6452f, 345.2202f,
+        200.8640f, 162.2960f, 245.4184f, 232.8059f, 41.0147f, 366.0289f, 186.8531f, 420.8625f, 326.4108f, 392.5565f, 432.9303f, 520.5973f,
+        231.0067f, 80.2522f, 322.9745f, 166.4729f, -12.8403f, 351.8312f, 33.9963f, 384.6920f, 135.3959f, 271.4291f, 180.9655f, 406.5427f,
+        85.0562f, 235.5178f, 91.9452f, 287.5727f, 273.1645f, 90.8612f, 382.7083f, 97.6691f, 133.7990f, 360.2684f, 141.2321f, 434.9638f,
+        31.6115f, 470.5798f, 33.3353f, 490.0465f, -27.3799f, 342.6524f, 82.3149f, 379.1839f, 219.6726f, 402.7702f, 362.0547f, 515.0898f,
+        -45.9977f, 481.8516f, 67.7212f, 502.3336f, 388.7589f, 115.4080f, 460.0333f, 236.6427f, 40.9882f, 248.8122f, 114.4089f, 389.4114f,
+        270.2910f, 191.2797f, 336.2753f, 282.6530f, 197.6581f, 439.8926f, 247.0300f, 546.7361f, 182.0580f, -6.7583f, 260.7935f, 100.5661f,
+        3.2778f, 131.7233f, 68.5193f, 280.6516f, 356.3126f, 411.8249f, 446.4396f, 463.7141f, 379.1163f, 129.3928f, 513.9362f, 154.6585f,
+        -69.1199f, 354.7185f, 80.1365f, 433.0744f, 82.9357f, 151.1645f, 95.6685f, 231.6187f, 422.7932f, 476.2348f, 481.1110f, 503.7437f,
+        260.7842f, 395.5883f, 288.7094f, 487.9416f, 48.2868f, 149.1079f, 101.7528f, 152.2125f, 79.4785f, 315.4853f, 123.3120f, 454.7079f,
+        316.4901f, 148.2175f, 343.4961f, 188.6391f, 304.9847f, 299.7342f, 419.8321f, 306.6287f, 262.2399f, 320.6758f, 337.1869f, 337.8050f,
+        407.5904f, 396.3992f, 545.5580f, 433.1963f, 244.1037f, -8.6806f, 249.9599f, 33.1314f, 144.6461f, 107.1346f, 155.6258f, 113.0233f,
+        208.0726f, 334.6470f, 269.1603f, 377.2708f, 173.3525f, 266.8875f, 186.3138f, 296.6358f, 92.1346f, 219.0953f, 132.2813f, 276.5098f,
+        -50.9776f, -1.5900f, 96.9408f, 56.8000f, 160.0388f, 148.3819f, 192.1737f, 199.8940f, 340.4449f, 407.6198f, 370.9644f, 457.4804f,
+        -34.0173f, 8.2614f, 52.4551f, 22.6314f, 181.9884f, 195.8403f, 257.1901f, 200.5959f, 278.2621f, 457.0166f, 365.7473f, 488.1317f,
+        276.6353f, -31.4300f, 333.7688f, 82.3108f, 326.2304f, 300.5375f, 450.4180f, 449.1682f, 394.4356f, 59.1311f, 416.0841f, 198.4815f,
+        323.4377f, 395.2401f, 388.2682f, 471.3687f, -0.4884f, 332.9131f, 103.2861f, 413.1549f, 172.3276f, 418.9163f, 302.6948f, 466.7889f,
+        273.6699f, 49.8039f, 329.7361f, 166.1209f, 79.9860f, 208.1720f, 165.5801f, 323.1208f, 15.6250f, 326.2367f, 26.9268f, 453.0333f,
+        98.6064f, 55.6348f, 124.9839f, 190.0650f, 221.7964f, 82.5141f, 233.0980f, 148.2322f, 152.2380f, -44.0412f, 261.6923f, 71.2233f,
+        66.3730f, 418.6809f, 110.2940f, 539.8344f, 357.7888f, 331.5282f, 466.6268f, 378.4887f, 457.3967f, 248.0516f, 468.2900f, 387.5087f,
+        35.9143f, 364.4689f, 165.4340f, 379.5258f, 402.0395f, 191.2334f, 527.5334f, 340.3795f, 1.8053f, 180.1951f, 16.0557f, 295.9387f,
+        460.2114f, 217.3174f, 464.7511f, 232.2148f, 471.2709f, 270.8305f, 480.6579f, 369.6087f, -58.0695f, 97.7211f, 70.1214f, 103.8139f,
+        363.5242f, 386.1504f, 399.4951f, 501.9083f, 443.7544f, 345.8341f, 526.4471f, 465.9183f, 420.6959f, 129.4022f, 485.2063f, 220.1614f,
+        425.5884f, 224.9686f, 545.1217f, 353.6407f, 238.2388f, 62.7213f, 312.0847f, 78.3060f, 1.2788f, 465.1168f, 76.8773f, 507.2295f,
+        350.7072f, 420.0901f, 499.0819f, 482.8026f, 295.2295f, 457.2856f, 318.5988f, 464.6119f, 248.9387f, 366.2193f, 368.7308f, 464.4846f,
+        266.4057f, -43.0988f, 411.9049f, 94.8485f, 365.3591f, 230.8355f, 381.3726f, 246.8133f, 213.6699f, 419.1429f, 302.9046f, 467.1919f,
+        282.3146f, 326.7091f, 321.6300f, 338.5049f, 157.0835f, 271.7193f, 238.9818f, 413.4953f, -3.7474f, 97.9864f, 45.0004f, 165.3309f,
+        28.3577f, 158.4742f, 71.5941f, 260.1006f, 284.2465f, 120.1271f, 370.7495f, 246.4540f, 483.6205f, 186.3921f, 511.9348f, 335.0511f,
+        -27.5488f, 218.5612f, 43.3521f, 243.6668f, 229.8062f, 103.3855f, 327.7773f, 223.5129f, 365.4548f, 86.1273f, 385.5540f, 219.3533f,
+        343.5581f, 121.2852f, 483.2167f, 129.5677f, 234.4260f, 125.8439f, 310.7789f, 239.2034f, 248.4032f, 48.0437f, 371.5128f, 101.8978f,
+        299.1465f, 387.2317f, 397.5784f, 484.8726f, 376.0880f, 262.2631f, 482.8782f, 339.8563f, 7.2930f, 47.0424f, 114.9965f, 86.7440f,
+        397.3961f, 336.3557f, 528.7860f, 357.5037f, -33.2049f, 414.6207f, 59.2223f, 433.0458f, 396.8727f, 110.5703f, 439.3271f, 126.9654f,
+        30.4567f, 27.2849f, 46.2837f, 123.3157f, 51.6484f, -22.3715f, 142.9798f, 30.9887f, -3.4962f, 6.9860f, 7.3904f, 40.2644f,
+        204.1520f, 329.0802f, 241.1047f, 433.1711f, 162.1569f, 441.9229f, 172.2023f, 545.2635f, 41.6043f, -18.2279f, 124.3886f, 63.1082f,
+        213.0999f, 303.8811f, 237.9903f, 444.1898f, 155.2101f, 6.7177f, 247.1608f, 65.1444f, 324.4111f, 233.2946f, 443.2500f, 358.8382f,
+        384.8351f, 371.9398f, 508.2953f, 384.1355f, 302.7226f, 123.9848f, 349.8446f, 235.2196f, 20.8081f, -68.6720f, 103.6023f, 79.6067f,
+        105.2511f, 234.0231f, 190.1397f, 361.1662f, 420.9290f, 451.9373f, 492.3893f, 539.3073f, -4.9387f, 81.6146f, 93.6732f, 176.0028f,
+        187.2764f, 67.9256f, 219.5794f, 121.5657f, 397.7987f, 10.8413f, 544.7059f, 113.0846f, 467.5255f, 219.7334f, 483.1394f, 335.5223f,
+        143.3246f, 223.3545f, 267.8786f, 373.0906f, 288.9383f, 358.9469f, 378.4586f, 433.3239f, 209.6311f, 371.4695f, 247.1145f, 381.6038f,
+        320.6775f, 401.3793f, 432.7831f, 491.1622f, 8.9968f, 393.5190f, 22.5845f, 412.2537f, 13.8844f, 104.8985f, 130.2727f, 142.3685f,
+        262.6455f, 252.9446f, 351.5533f, 302.9328f, 107.5252f, 93.7443f, 125.0270f, 203.6677f, 326.6030f, 150.6990f, 339.4493f, 179.0864f,
+        119.1742f, 453.1236f, 232.0488f, 478.8208f, 420.9991f, 337.0981f, 465.6465f, 344.7978f, 342.8767f, 421.7388f, 476.3827f, 552.8516f,
+        189.1445f, 156.2901f, 303.6933f, 260.6224f, 333.9324f, 265.2428f, 438.9627f, 272.1948f, 114.3128f, 240.9499f, 156.8251f, 246.1655f,
+        193.8135f, 11.5223f, 300.4463f, 95.7648f, 27.6040f, 96.8022f, 169.8780f, 139.8998f, 423.1219f, 218.8621f, 437.7643f, 308.7743f,
+        386.7347f, 0.8091f, 436.3329f, 66.5652f, 433.0917f, 396.4442f, 469.0579f, 535.0178f, 408.9413f, 39.9801f, 468.5356f, 83.8636f,
+        423.9944f, 47.8940f, 535.6019f, 150.0867f, 78.3370f, 378.1336f, 149.9992f, 387.1877f, 422.8927f, -23.2443f, 508.9316f, 120.1789f,
+        261.7021f, 376.5726f, 309.5111f, 523.7055f, 200.2215f, 307.2894f, 222.2736f, 418.4116f, 259.8004f, -0.8479f, 300.5735f, 69.4688f,
+        106.7550f, 329.0340f, 235.8474f, 362.8130f, 98.8964f, 254.7818f, 189.6566f, 376.8467f, 91.9970f, 323.3163f, 149.3173f, 434.0331f,
+        -18.1340f, 397.0634f, 100.5620f, 431.1345f, 242.9804f, 325.0598f, 253.5845f, 393.2908f, 424.4659f, 258.1096f, 463.2957f, 328.0667f,
+        297.4333f, 99.1641f, 332.7187f, 223.2992f, 186.5782f, 297.1904f, 334.3975f, 400.0833f, 161.1921f, 430.0698f, 267.4008f, 526.9018f,
+        185.6758f, 244.8488f, 278.7259f, 342.6730f, 103.7673f, 311.5224f, 105.5101f, 352.8224f, 397.2368f, 190.3715f, 425.6990f, 246.7565f,
+        51.3437f, 374.1586f, 147.0393f, 381.9622f, 329.5223f, 439.7066f, 387.1005f, 557.9608f, 310.6336f, 47.4363f, 449.3514f, 112.9530f,
+        229.9626f, 68.0539f, 344.9065f, 134.3514f, 397.6331f, 250.9398f, 465.2933f, 288.4979f, 89.1863f, 224.5854f, 201.8640f, 256.7900f,
+        367.6410f, 241.4922f, 513.9763f, 330.0776f, 329.8622f, 6.7118f, 399.5483f, 42.3622f, 351.0067f, 196.8547f, 447.7431f, 207.4218f,
+        263.3493f, 233.8098f, 401.2304f, 349.1684f, 404.1452f, 264.0487f, 442.1978f, 321.1426f, 430.0009f, 299.8394f, 563.0980f, 357.4945f,
+        202.3143f, 327.4748f, 217.8485f, 392.7412f, 358.1485f, 259.5528f, 455.7672f, 381.9944f, 313.4684f, 370.7192f, 431.1113f, 419.5239f,
+        180.1469f, 255.4066f, 272.7232f, 369.3540f, 426.0572f, 198.2577f, 500.8918f, 339.2499f, 150.7206f, 253.3635f, 243.7053f, 352.8329f,
+        270.9340f, 17.9364f, 294.5319f, 83.2569f, 36.4112f, 80.3679f, 69.5312f, 192.7886f, 92.2801f, 229.0865f, 133.4951f, 298.3132f,
+        375.3135f, 405.1188f, 465.3827f, 467.8684f, 164.8547f, 299.8922f, 231.6980f, 379.1594f, 178.3286f, 21.0337f, 215.7555f, 69.3744f,
+        56.7212f, 287.8708f, 189.2598f, 304.4041f, 217.4480f, 79.4625f, 274.1624f, 142.2755f, 369.1791f, 357.2809f, 436.6378f, 376.7356f,
+        416.5593f, 382.6425f, 478.6048f, 444.7983f, 21.0025f, 254.7366f, 49.1120f, 338.7197f, 232.4042f, 225.8433f, 342.4166f, 365.5193f,
+        199.7265f, 166.0972f, 267.5468f, 172.4943f, 305.4298f, 176.3264f, 308.8521f, 269.9237f, 151.3188f, 397.4529f, 295.9569f, 466.6555f,
+        138.0480f, 359.6507f, 260.5968f, 363.6696f, 181.5352f, 240.7855f, 290.3455f, 278.9682f, 225.7522f, 174.7890f, 356.2469f, 193.4433f,
+        182.4345f, 8.5387f, 318.5487f, 41.8410f, 210.4292f, 50.5482f, 261.7152f, 92.4592f, 362.9012f, 66.1153f, 454.9341f, 126.9099f,
+        326.9678f, 146.7783f, 418.6802f, 226.6052f, 150.2754f, 471.4981f, 191.1031f, 472.6456f, 383.2531f, 240.0174f, 417.3240f, 265.1360f,
+        417.8392f, 109.9494f, 435.8114f, 124.8908f, 27.1272f, 11.4244f, 126.3650f, 94.3257f, 232.6628f, 144.1367f, 350.0197f, 194.1688f,
+        85.4650f, 366.5097f, 199.8470f, 449.2209f, 345.5237f, 174.6456f, 393.6487f, 208.6972f, 103.6008f, 383.9478f, 135.1845f, 388.5580f,
+        301.4075f, 330.7206f, 369.9960f, 471.9843f, 86.3247f, 46.8414f, 168.7999f, 63.9793f, 186.5999f, 294.3789f, 324.5439f, 314.2809f,
+        408.6489f, 468.1303f, 539.9976f, 490.9658f, 121.9074f, 127.4639f, 259.4001f, 274.6741f, 374.0247f, -21.0436f, 501.7138f, 71.9877f,
+        421.1110f, 415.6848f, 565.8336f, 507.6180f, 402.2457f, 367.8241f, 472.6052f, 515.8422f, 78.8962f, 253.9820f, 86.9698f, 268.1594f,
+        403.1037f, 203.0262f, 416.5545f, 349.2269f, -13.5009f, 90.1716f, 45.6503f, 121.5695f, 176.9532f, 362.8065f, 216.3486f, 456.6442f,
+        422.2061f, 217.5038f, 448.5273f, 281.0963f, 272.8624f, -12.1655f, 415.8898f, 46.0433f, 251.3114f, 271.6299f, 281.4290f, 411.3851f,
+        121.9583f, 463.6307f, 265.9058f, 486.8656f, 348.9660f, 339.7936f, 463.3310f, 489.3569f, 306.5287f, 109.8543f, 403.0297f, 167.3439f,
+        183.3392f, -22.1712f, 285.0661f, 75.4963f, 421.0473f, 397.5667f, 471.4370f, 542.7847f, 66.3152f, 463.7401f, 163.6328f, 473.3226f,
+        70.7872f, 196.9543f, 99.6043f, 335.4611f, 251.0428f, 278.3568f, 391.7609f, 363.9607f, 463.0136f, 178.3225f, 508.9808f, 284.2776f,
+        104.1169f, 198.2685f, 143.1397f, 221.4969f, 71.3536f, 19.4869f, 178.3168f, 99.9616f, 20.3440f, -2.3003f, 119.1549f, 99.0532f,
+        396.1600f, 81.8756f, 464.4035f, 150.8565f, 65.5815f, 406.2740f, 160.8160f, 430.3668f, 239.2070f, 54.2293f, 263.9715f, 91.6030f,
+        444.7733f, 49.1971f, 546.0992f, 177.5016f, -14.5900f, 271.2390f, 26.7309f, 277.3751f, 257.4168f, 54.2554f, 299.0693f, 160.8758f,
+        243.5621f, 6.6488f, 268.7269f, 156.5579f, 378.4616f, 280.6006f, 428.9858f, 282.7156f, 152.4626f, 171.5487f, 202.8190f, 196.5445f,
+        170.8344f, 262.3559f, 239.5070f, 363.8034f, 69.2827f, 451.1334f, 98.6552f, 461.0720f, 355.5286f, 31.0572f, 385.2867f, 119.9359f,
+        351.4949f, 405.2588f, 433.2140f, 508.1748f, 58.2303f, 406.9281f, 78.4330f, 495.5619f, 144.9057f, 386.8375f, 248.5514f, 442.2501f,
+        375.6284f, 263.1954f, 517.2766f, 368.0905f, -30.9426f, 265.2984f, 33.6499f, 354.8483f, 81.7472f, 303.6374f, 217.0119f, 335.5753f,
+        269.6966f, 302.7942f, 285.3457f, 387.7014f, 163.3466f, -57.9610f, 170.7473f, 74.4432f, 81.7806f, 428.8672f, 190.2646f, 529.2253f,
+        172.8226f, 257.1534f, 287.2148f, 328.4503f, 27.4537f, 366.2749f, 154.0694f, 415.1909f, 260.0797f, 181.7424f, 269.5455f, 195.5394f,
+        294.9684f, -12.5261f, 411.7275f, 24.9233f, 259.0953f, 253.5339f, 316.1996f, 256.2007f, 23.4560f, 179.5914f, 69.6533f, 327.5987f,
+        408.8140f, 201.4197f, 435.5946f, 235.5696f, 12.7857f, 108.6503f, 162.1921f, 231.0668f, 377.1631f, 111.8490f, 387.6489f, 137.9771f,
+        118.1705f, 242.1441f, 242.3947f, 285.4007f, 343.2383f, 155.9774f, 439.5230f, 219.3007f, 47.8730f, 460.2977f, 158.3999f, 509.6342f,
+        39.8081f, 26.4865f, 146.8540f, 146.4408f, 184.0596f, 87.9846f, 312.9663f, 231.6809f, 2.2755f, 81.2708f, 30.6605f, 212.6897f,
+        112.0872f, 259.7130f, 113.2101f, 283.5961f, 316.9157f, 191.2768f, 407.0965f, 308.0034f, 391.8293f, 310.3482f, 445.5542f, 333.3923f,
+        30.6705f, 406.4540f, 50.1148f, 543.5478f, 426.6715f, 103.5286f, 455.4062f, 181.6925f, 373.5433f, 320.8254f, 423.9739f, 371.9462f,
+        429.1098f, 0.3217f, 440.5745f, 24.7185f, 344.4742f, 129.8145f, 353.9543f, 132.5740f, 268.3326f, 212.8878f, 405.8205f, 250.8319f,
+        238.7950f, -53.0971f, 286.2983f, 84.0919f};
+    std::vector<float> scores_vec = {
+        0.9822f, 0.9644f, 0.1426f, 0.7149f, 0.6008f, 0.6906f, 0.0962f, 0.1886f, 0.0766f, 0.6041f, 0.9866f, 0.6720f,
+        0.7108f, 0.9846f, 0.6780f, 0.0402f, 0.8670f, 0.3647f, 0.0044f, 0.5072f, 0.9370f, 0.2573f, 0.4915f, 0.1738f,
+        0.0577f, 0.0805f, 0.7270f, 0.8641f, 0.1433f, 0.2883f, 0.1950f, 0.0269f, 0.5534f, 0.6999f, 0.6479f, 0.3881f,
+        0.5550f, 0.0941f, 0.1543f, 0.9318f, 0.7615f, 0.9227f, 0.9167f, 0.6494f, 0.9282f, 0.4167f, 0.0036f, 0.0626f,
+        0.1095f, 0.0954f, 0.3517f, 0.7013f, 0.7906f, 0.5902f, 0.1464f, 0.7479f, 0.3548f, 0.0130f, 0.2806f, 0.3306f,
+        0.2742f, 0.8119f, 0.7599f, 0.6956f, 0.1390f, 0.8078f, 0.6772f, 0.1948f, 0.6481f, 0.4835f, 0.4394f, 0.1121f,
+        0.5183f, 0.0999f, 0.1643f, 0.1325f, 0.9541f, 0.2849f, 0.3552f, 0.3221f, 0.8983f, 0.5630f, 0.9192f, 0.2999f,
+        0.1148f, 0.5562f, 0.3455f, 0.8019f, 0.8794f, 0.4726f, 0.9714f, 0.5530f, 0.2709f, 0.4890f, 0.0373f, 0.8040f,
+        0.1014f, 0.3087f, 0.5653f, 0.0430f, 0.0793f, 0.6961f, 0.0718f, 0.4771f, 0.3387f, 0.2281f, 0.1888f, 0.7634f,
+        0.9515f, 0.1402f, 0.9597f, 0.5948f, 0.6417f, 0.7099f, 0.7041f, 0.8198f, 0.4835f, 0.5334f, 0.3238f, 0.1053f,
+        0.6646f, 0.0336f, 0.2756f, 0.0942f, 0.1907f, 0.6387f, 0.6285f, 0.4211f, 0.0902f, 0.4334f, 0.3527f, 0.7205f,
+        0.5790f, 0.4916f, 0.4870f, 0.9663f, 0.7563f, 0.4970f, 0.4792f, 0.0265f, 0.9425f, 0.3192f, 0.2559f, 0.9994f,
+        0.7187f, 0.0474f, 0.0619f, 0.0255f, 0.5996f, 0.0716f, 0.9334f, 0.9369f, 0.5461f, 0.6166f, 0.2919f, 0.0640f,
+        0.7375f, 0.1018f, 0.0856f, 0.3112f, 0.0125f, 0.4340f, 0.7077f, 0.8013f, 0.6043f, 0.8469f, 0.4065f, 0.8488f,
+        0.5065f, 0.2230f, 0.9441f, 0.2750f, 0.0262f, 0.2427f, 0.3667f, 0.3513f, 0.5247f, 0.8831f, 0.2923f, 0.5208f,
+        0.3401f, 0.8218f, 0.1576f, 0.1035f, 0.5030f, 0.6719f, 0.7955f, 0.5896f, 0.7738f, 0.3927f, 0.0329f, 0.1161f,
+        0.0387f, 0.3289f, 0.4955f, 0.3563f, 0.5606f, 0.4806f, 0.6779f, 0.6670f, 0.3181f, 0.3462f, 0.5851f, 0.5964f,
+        0.3147f, 0.3303f, 0.6940f, 0.6474f, 0.1351f, 0.4410f, 0.8927f, 0.0363f, 0.8552f, 0.1632f, 0.5072f, 0.4243f,
+        0.0101f, 0.9154f, 0.4549f, 0.9543f, 0.2867f, 0.8663f, 0.9224f, 0.5568f, 0.2027f, 0.6852f, 0.5490f, 0.9445f,
+        0.4393f, 0.2685f, 0.1383f, 0.6986f, 0.9741f, 0.0283f, 0.7404f, 0.9269f, 0.0748f, 0.1102f, 0.6920f, 0.6480f,
+        0.0688f, 0.8344f, 0.5234f, 0.9072f, 0.8780f, 0.8125f, 0.5159f, 0.2517f, 0.5060f, 0.1008f, 0.6588f, 0.1340f,
+        0.5112f, 0.0544f, 0.2995f, 0.2321f, 0.6200f, 0.7868f, 0.0573f, 0.8503f, 0.8608f, 0.3423f, 0.6590f, 0.4026f,
+        0.1542f, 0.5287f, 0.0864f, 0.8785f, 0.9243f, 0.8216f, 0.5625f, 0.5576f, 0.9846f, 0.2479f, 0.0759f, 0.5619f,
+        0.3288f, 0.3223f, 0.0071f, 0.5962f, 0.2640f, 0.1879f, 0.0404f, 0.3644f, 0.8790f, 0.3367f, 0.6791f, 0.7565f,
+        0.3281f, 0.8216f, 0.6919f, 0.5592f, 0.0010f, 0.0351f, 0.9909f, 0.7823f, 0.9376f, 0.9023f, 0.0204f, 0.7918f,
+        0.4511f, 0.7896f, 0.0067f, 0.2882f, 0.7513f, 0.7930f, 0.6197f, 0.3013f, 0.3104f, 0.9668f, 0.4392f, 0.4471f,
+        0.5523f, 0.4095f, 0.5527f, 0.4323f, 0.8267f, 0.9091f, 0.9321f, 0.5643f, 0.4421f, 0.7052f, 0.8383f, 0.5630f,
+        0.7000f, 0.7497f, 0.6764f, 0.7461f, 0.2086f, 0.4984f, 0.5883f, 0.0025f, 0.8560f, 0.6100f, 0.1291f, 0.8164f,
+        0.7171f, 0.7583f, 0.3920f, 0.8542f, 0.4140f, 0.5705f, 0.0006f, 0.6449f, 0.7182f, 0.5671f, 0.4966f, 0.8099f,
+        0.6814f, 0.2781f, 0.9591f, 0.7073f, 0.9879f, 0.9713f, 0.9189f, 0.7554f, 0.6094f, 0.1722f, 0.5434f, 0.7654f,
+        0.5209f, 0.8682f, 0.1097f, 0.3809f, 0.5060f, 0.4323f, 0.1086f, 0.1535f, 0.8376f, 0.4844f, 0.0487f, 0.0165f,
+        0.4735f, 0.1644f, 0.7051f, 0.7953f, 0.2283f, 0.5922f, 0.1544f, 0.3036f, 0.8888f, 0.5441f, 0.8859f, 0.2252f,
+        0.3300f, 0.4710f, 0.4801f, 0.9976f, 0.1144f, 0.8520f, 0.8637f, 0.5532f, 0.3440f, 0.5192f, 0.2925f, 0.7991f,
+        0.4983f, 0.9258f, 0.6227f, 0.5143f, 0.7111f, 0.5039f, 0.9045f, 0.1844f, 0.9733f, 0.8122f, 0.8607f, 0.4829f,
+        0.8372f, 0.3068f, 0.7619f, 0.1405f, 0.3071f, 0.4457f, 0.3223f, 0.3870f, 0.8201f, 0.2567f, 0.7453f, 0.0737f,
+        0.7657f, 0.7920f, 0.4017f, 0.7225f, 0.9151f, 0.8007f, 0.3904f, 0.4842f, 0.7794f, 0.2926f, 0.8039f, 0.3281f,
+        0.8060f, 0.0868f, 0.0444f, 0.9977f, 0.8695f, 0.8828f, 0.9513f, 0.4383f, 0.2868f, 0.1300f, 0.5012f, 0.2200f,
+        0.9356f, 0.0040f, 0.1432f, 0.2465f, 0.1990f, 0.2258f, 0.6560f, 0.3275f, 0.6150f, 0.8903f, 0.6026f, 0.6945f,
+        0.3655f, 0.1597f, 0.3206f, 0.9643f, 0.6218f, 0.2775f, 0.4509f, 0.8355f, 0.6684f, 0.5607f, 0.8852f, 0.6724f,
+        0.6427f, 0.1898f, 0.1064f, 0.9651f, 0.5989f, 0.4157f, 0.5890f, 0.0618f, 0.8221f, 0.2166f, 0.8045f, 0.5344f,
+        0.2766f, 0.0302f, 0.8158f, 0.1765f, 0.0518f, 0.7559f, 0.3500f, 0.3893f, 0.2471f, 0.8592f, 0.2973f, 0.2102f,
+        0.3092f, 0.2031f, 0.3177f, 0.0829f, 0.1585f, 0.4171f, 0.8795f, 0.0573f, 0.2127f, 0.9083f, 0.8900f, 0.6795f,
+        0.2405f, 0.4198f, 0.2112f, 0.1286f, 0.3800f, 0.5758f, 0.3599f, 0.6108f, 0.2963f, 0.3459f, 0.7907f, 0.8783f,
+        0.3220f, 0.5715f, 0.2782f, 0.0533f, 0.7379f, 0.1710f, 0.4257f, 0.4870f, 0.1845f, 0.0946f, 0.3480f, 0.9523f,
+        0.6151f, 0.3814f, 0.0389f, 0.6003f, 0.0923f, 0.5425f, 0.7520f, 0.4236f, 0.2994f, 0.0474f, 0.0248f, 0.4300f,
+        0.8833f, 0.2441f, 0.5741f, 0.6843f, 0.0608f, 0.1531f, 0.3313f, 0.6701f, 0.4390f, 0.7342f, 0.8676f, 0.7584f,
+        0.9922f, 0.7544f, 0.8522f, 0.8324f, 0.7303f, 0.8018f, 0.9347f, 0.4752f, 0.6383f, 0.5149f, 0.8510f, 0.4314f,
+        0.8197f, 0.7994f, 0.9619f, 0.2489f, 0.7096f, 0.7569f, 0.9363f, 0.9069f, 0.5735f, 0.5792f, 0.1673f, 0.9750f,
+        0.2550f, 0.7247f, 0.7958f, 0.4412f, 0.2112f, 0.1890f, 0.8565f, 0.5108f, 0.0901f, 0.7170f, 0.2502f, 0.8764f,
+        0.3096f, 0.2003f, 0.0849f, 0.5115f, 0.4507f, 0.7513f, 0.4646f, 0.3438f, 0.2617f, 0.2781f, 0.9278f, 0.1651f,
+        0.9882f, 0.3269f, 0.0884f, 0.2487f, 0.0584f, 0.7900f, 0.5126f, 0.3370f, 0.6620f, 0.6306f, 0.9399f, 0.9613f,
+        0.6807f, 0.8178f, 0.7924f, 0.4913f, 0.7045f, 0.0783f, 0.7580f, 0.9618f, 0.0850f, 0.8361f, 0.9330f, 0.2262f,
+        0.5248f, 0.9279f, 0.9602f, 0.1279f, 0.3490f, 0.6981f, 0.2216f, 0.3248f, 0.0233f, 0.1535f, 0.5623f, 0.6531f,
+        0.6489f, 0.7784f, 0.4153f, 0.2735f, 0.0156f, 0.2066f, 0.3124f, 0.1782f, 0.0201f, 0.1574f, 0.6661f, 0.6296f,
+        0.9357f, 0.7982f, 0.5678f, 0.1376f, 0.5641f, 0.0616f, 0.4309f, 0.3903f, 0.4278f, 0.2798f, 0.6858f, 0.8409f,
+        0.7685f, 0.6278f, 0.5383f, 0.0311f, 0.7229f, 0.5450f, 0.2707f, 0.3278f, 0.9356f, 0.6244f, 0.4759f, 0.6209f,
+        0.4137f, 0.4702f, 0.2903f, 0.4399f, 0.6856f, 0.0399f, 0.7950f, 0.2830f, 0.6826f, 0.6427f, 0.6526f, 0.6081f,
+        0.9591f, 0.5083f, 0.7323f, 0.7054f, 0.2363f, 0.2833f, 0.4240f, 0.2777f, 0.3667f, 0.3910f, 0.6039f, 0.2199f,
+        0.8043f, 0.4375f, 0.7062f, 0.0814f, 0.4700f, 0.0282f, 0.6759f, 0.3437f, 0.9493f, 0.3241f, 0.5638f, 0.2574f,
+        0.6201f, 0.4670f, 0.3706f, 0.2037f, 0.1115f, 0.1199f, 0.9990f, 0.4123f, 0.0019f, 0.9529f, 0.0200f, 0.4186f,
+        0.7175f, 0.9146f, 0.7129f, 0.4636f, 0.9744f, 0.0393f, 0.9869f, 0.8494f, 0.9289f, 0.2548f, 0.1425f, 0.6633f,
+        0.5159f, 0.5232f, 0.9246f, 0.6201f, 0.3111f, 0.4001f, 0.1335f, 0.1923f, 0.1434f, 0.8103f, 0.7049f, 0.5303f,
+        0.3744f, 0.6685f, 0.8129f, 0.8812f, 0.5470f, 0.8199f, 0.5113f, 0.4745f, 0.8654f, 0.3864f, 0.3959f, 0.3049f,
+        0.5187f, 0.5449f, 0.6605f, 0.4305f, 0.2178f, 0.8668f, 0.3460f, 0.9229f, 0.2074f, 0.5601f, 0.5366f, 0.8286f,
+        0.1389f, 0.9099f, 0.5314f, 0.5861f, 0.5102f, 0.0360f, 0.4971f, 0.2635f, 0.3427f, 0.6491f, 0.4977f, 0.0932f,
+        0.0730f, 0.1857f, 0.1909f, 0.6083f, 0.1778f, 0.8817f, 0.2098f, 0.0911f, 0.8757f, 0.2953f, 0.4254f, 0.9590f,
+        0.9444f, 0.7149f, 0.0689f, 0.5933f, 0.9891f, 0.9469f, 0.1060f, 0.3960f};
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 143, 0, 0, 10, 0, 0, 13, 0, 0, 0, 0, 0, 90, 0, 0, 135, 0, 0, 1, 0, 0, 76, 0, 0, 108, 0, 0, 170, 0, 0, 140, 0, 0, 20, 0, 0, 151, 0, 0, 150, 0, 0, 39, 0, 0, 44, 0, 0, 41, 0, 0, 82, 0, 0, 80, 0, 0, 88, 0, 0, 16, 0, 0, 27, 0, 0, 167, 0, 0, 165, 0, 0, 181, 0, 1, 187, 0, 1, 94, 0, 1, 152, 0, 1, 72, 0, 1, 32, 0, 1, 153, 0, 1, 109, 0, 1, 150, 0, 1, 19, 0, 1, 27, 0, 1, 96, 0, 1, 35, 0, 1, 197, 0, 1, 68, 0, 1, 22, 0, 1, 154, 0, 1, 17, 0, 1, 117, 0, 1, 43, 0, 1, 97, 0, 1, 10, 0, 1, 180, 0, 1, 182, 0, 1, 67, 0, 1, 44, 1, 0, 35, 1, 0, 152, 1, 0, 175, 1, 0, 4, 1, 0, 71, 1, 0, 166, 1, 0, 127, 1, 0, 38, 1, 0, 170, 1, 0, 44, 1, 0, 158, 1, 0, 198, 1, 0, 24, 1, 0, 101, 1, 0, 171, 1, 0, 2, 1, 0, 53, 1, 0, 102, 1, 0, 66, 1, 0, 140, 1, 0, 37, 1, 0, 98, 1, 0, 115, 1, 0, 150, 1, 0, 6, 1, 1, 114, 1, 1, 196, 1, 1, 0, 1, 1, 126, 1, 1, 124, 1, 1, 19, 1, 1, 11, 1, 1, 26, 1, 1, 84, 1, 1, 191, 1, 1, 117, 1, 1, 104, 1, 1, 197, 1, 1, 192, 1, 1, 10, 1, 1, 48, 1, 1, 68, 1, 1, 22, 1, 1, 128, 1, 1, 25, 1, 1, 134, 1, 1, 163, 1, 1, 121, 1, 1, 169, 1, 1, 185};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 100);
+}
+
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }

From 600d9fbb6cbdb280dd78ed884391e17e58170a8f Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Tue, 19 May 2026 21:22:33 -0500
Subject: [PATCH 15/32] Split NMS sort output into separate score, box and index tensors

---
 src/targets/gpu/jit/nonmaxsuppression.cpp     | 39 ++++++++++++++-----
 src/targets/gpu/prepare_nonmaxsuppression.cpp | 13 +++----
 test/gpu/nonmaxsuppression.cpp                |  8 +++-
 3 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index dfa5aaffcba..dbfe766882b 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -46,12 +46,21 @@ extern "C" {
 
 MIGRAPHX_GLOBAL void nms_sort_kernel(${params})
 {
-    make_tensors()(${args})([](auto boxes, auto scores, auto sorted) {
+    make_tensors()(${args})([](auto boxes,
+                               auto scores,
+                               auto sorted_scores,
+                               auto sorted_boxes,
+                               auto sorted_box_indices) {
         nonmaxsuppression_sort<${center_point_box},
                                ${num_batches},
                                ${num_classes},
                                ${num_boxes},
-                               ${aligned_num_boxes}>(boxes, scores, sorted);
+                               ${aligned_num_boxes}>(
+            boxes,
+            scores,
+            sorted_scores,
+            sorted_boxes,
+            sorted_box_indices);
     });
 }
 
@@ -71,18 +80,28 @@ extern "C" {
 
 MIGRAPHX_GLOBAL void nms_filter_kernel(${params})
 {
-    make_tensors()(${args})([](auto sorted,
-                                auto max_p,
-                                auto iou_p,
-                                auto thr_p,
-                                auto mask,
-                                auto output,
-                                auto counts) {
+    make_tensors()(${args})([](auto sorted_scores,
+                               auto sorted_boxes,
+                               auto sorted_box_indices,
+                               auto max_p,
+                               auto iou_p,
+                               auto thr_p,
+                               auto mask,
+                               auto output,
+                               auto counts) {
         nonmaxsuppression_filter<${num_batches},
                                  ${num_classes},
                                  ${num_boxes},
                                  ${aligned_num_boxes}>(
-            sorted, max_p, iou_p, thr_p, mask, output, counts);
+            sorted_scores,
+            sorted_boxes,
+            sorted_box_indices,
+            max_p,
+            iou_p,
+            thr_p,
+            mask,
+            output,
+            counts);
     });
 }
 
diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp
index 53514963c13..e6e2c021578 100644
--- a/src/targets/gpu/prepare_nonmaxsuppression.cpp
+++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp
@@ -38,11 +38,6 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-// nms_data is laid out as { float score; float box[4]; int box_index; } for a
-// total of 24 bytes per entry. The scratch workspace is allocated as raw uint8
-// and reinterpreted in the kernel.
-static constexpr std::size_t nms_bytes_per_data = 24;
-
 // Sort boxes per (batch, class) into nms_data{} tensor.
 struct nms_sort
 {
@@ -66,9 +61,11 @@ struct nms_sort
         const auto num_batches = boxes_s.lens()[0];
         const auto num_boxes  = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
-        const auto aligned_b =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint32_t>(num_boxes)));
-        return shape{shape::uint8_type, {num_batches * num_classes * aligned_b * nms_bytes_per_data}};
+        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint32_t>(num_boxes)));
+        shape out_scores_shape{shape::float_type, {num_batches * num_classes, aligned_b}};
+        shape out_boxes_shape{shape::float_type, {num_batches * num_classes, aligned_b, 4}};
+        shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}};
+        return shape{{out_scores_shape, out_boxes_shape, out_box_index_shape}};
     }
 };
 MIGRAPHX_REGISTER_OP(nms_sort);
diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp
index 3f7aab9b432..119b2ab6a8e 100644
--- a/test/gpu/nonmaxsuppression.cpp
+++ b/test/gpu/nonmaxsuppression.cpp
@@ -66,7 +66,7 @@ static void add_nms_return(migraphx::module* mm, migraphx::instruction_ref nms)
     mm->add_return({idx, cnt});
 }
 
-TEST_CASE(nms_test)
+TEST_CASE(nms_default_test)
 {
     migraphx::program p;
     auto* mm = p.get_main_module();
@@ -362,6 +362,7 @@ TEST_CASE(nms_multi_class_test)
     EXPECT(num_selected == 4);
 }
 
+// Values generated from onnxruntime CPU EP
 TEST_CASE(nms_20boxes_test)
 {
     migraphx::program p;
@@ -407,6 +408,7 @@ TEST_CASE(nms_20boxes_test)
     EXPECT(num_selected == 10);
 }
 
+// Values generated from onnxruntime CPU EP
 TEST_CASE(nms_50boxes_center_test)
 {
     migraphx::program p;
@@ -465,6 +467,7 @@ TEST_CASE(nms_50boxes_center_test)
     EXPECT(num_selected == 20);
 }
 
+// Values generated from onnxruntime CPU EP
 TEST_CASE(nms_100boxes_2batch_test)
 {
     migraphx::program p;
@@ -585,6 +588,7 @@ TEST_CASE(nms_100boxes_2batch_test)
     EXPECT(num_selected == 30);
 }
 
+// Values generated from onnxruntime CPU EP
 TEST_CASE(nms_30boxes_3class_test)
 {
     migraphx::program p;
@@ -639,6 +643,7 @@ TEST_CASE(nms_30boxes_3class_test)
     EXPECT(num_selected == 15);
 }
 
+// Values generated from onnxruntime CPU EP
 TEST_CASE(nms_200boxes_2batch_2class_test)
 {
     migraphx::program p;
@@ -876,5 +881,4 @@ TEST_CASE(nms_200boxes_2batch_2class_test)
     EXPECT(num_selected == 100);
 }
 
-
 int main(int argc, const char* argv[]) { test::run(argc, argv); }

From d5934c067ee5183e99f82215505e80bb50388fa0 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 20 May 2026 15:40:32 -0500
Subject: [PATCH 16/32] Version with iterator nms_data

---
 src/include/migraphx/op/nonmaxsuppression.hpp |   1 +
 src/targets/gpu/jit/nonmaxsuppression.cpp     |  13 +-
 .../migraphx/kernels/nonmaxsuppression.hpp    | 151 ++++++++++++------
 .../kernels/include/migraphx/kernels/sort.hpp |   9 +-
 src/targets/gpu/lowering.cpp                  |  21 +++
 5 files changed, 139 insertions(+), 56 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index b6cbd4c9bc1..d154733a581 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -75,6 +75,7 @@ struct nonmaxsuppression
         // Per ONNX spec, output is [num_selected_indices, 3] where each row is
         // [batch_index, class_index, box_index]. The maximum possible
         // num_selected_indices = num_batches * num_classes * spatial_dimension.
+        // TODO: can also be limited by max_output_boxes_per_class
         const auto max_num_boxes = max_batches * max_classes * max_spatial_dimension;
 
         auto fixed_shape_error_check = [&]() {
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index dbfe766882b..ff6b1d32ece 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -126,7 +126,10 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params})
                                auto selected_indices,
                                auto num_selected) {
         nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(
-            bc_counts, indices, selected_indices, num_selected);
+            bc_counts,
+            indices,
+            selected_indices,
+            num_selected);
     });
 }
 
@@ -153,16 +156,16 @@ struct nms_sort_compiler : compiler<nms_sort_compiler>
         auto block_size = compute_block_size(ctx, aligned_num_boxes, 1024);
 
         hip_compile_options options;
-        options.inputs         = inputs;
+        options.inputs         = flatten_shapes(inputs);
         options.output         = inputs.back();
         options.kernel_name    = "nms_sort_kernel";
-        options.virtual_inputs = inputs;
+        options.virtual_inputs = options.inputs;
         options.set_launch_params(v, num_batches * num_classes * block_size, block_size);
 
         auto src = interpolate_string(
             nms_sort_kernel_src,
-            {{"params", enum_params(inputs.size(), "void * private_p")},
-             {"args", enum_params(inputs.size(), "private_p")},
+            {{"params", enum_params(options.inputs.size(), "void * private_p")},
+             {"args", enum_params(options.inputs.size(), "private_p")},
              {"num_batches", std::to_string(num_batches)},
              {"num_classes", std::to_string(num_classes)},
              {"num_boxes", std::to_string(num_boxes)},
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index bde081bbc69..f999855c8e9 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -38,17 +38,19 @@
 
 namespace migraphx {
 
+template <class Score, class Box, class Index>
 struct nms_data
 {
-    float score;
-    array<float, 4> box;
-    int box_index;
+    // should hold iterators
+    Score score;
+    Box box;
+    Index box_index;
 };
 
 // Decode a single box into (xmin, ymin, xmax, ymax) corners.
 // Normalize such that [x1, y1] is the bottom left corner.
 template <bool CenterPointBox, class Box>
-__device__ inline array<float, 4> nms_normalize_box(Box box)
+__device__ inline array<typename Box::type, 4> nms_normalize_box(Box box)
 {
     if constexpr(CenterPointBox)
     {
@@ -99,11 +101,12 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N
 }
 
 // Comparator for sorting nms_data{}.
+template <class Score, class Box, class Index>
 struct nms_score_greater
 {
-    constexpr bool operator()(const nms_data& a, const nms_data& b) const
+    constexpr bool operator()(const nms_data<Score, Box, Index>& a, const nms_data<Score, Box, Index>& b) const
     {
-        return a.score > b.score;
+        return *(a.score) > *(b.score);
     }
 };
 
@@ -121,8 +124,15 @@ template <bool CenterPointBox,
           index_int AlignedNumBoxes,
           class Boxes,
           class Scores,
-          class Output>
-__device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output out_tv)
+          class SortedScores,
+          class SortedBoxes,
+          class SortedIndices>
+__device__ void nonmaxsuppression_sort(
+    Boxes boxes_tv,
+    Scores scores_tv,
+    SortedScores sorted_scores,
+    SortedBoxes sorted_boxes,
+    SortedIndices sorted_indices)
 {
     static_assert(NumBatches > 0);
     static_assert(NumClasses > 0);
@@ -134,42 +144,60 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output
     const int batch_idx = block_id / NumClasses;
     const int class_idx = block_id % NumClasses;
     
-    constexpr auto block_out_shape = make_shape(index_ints<AlignedNumBoxes>{});
-    auto* p = reinterpret_cast<nms_data*>(out_tv.data()) + block_id * AlignedNumBoxes;
-    auto block_out_tv = make_tensor_view<nms_data>(p, block_out_shape);
-
     // numpy indexing: scores[batch_idx, class_idx, :]
     const auto my_scores = slice_tensor(scores_tv, array<index_int, 3>{batch_idx, class_idx, 0}, slice_axes<2>());
+    
+    auto block_out_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+    auto block_out_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
+    auto block_out_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+
+    using scores_type = decltype(block_out_scores.begin());
+    using boxes_type = decltype(block_out_boxes.begin());
+    using indices_type = decltype(block_out_indices.begin());
+    __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, AlignedNumBoxes> block_nms_data;
 
-    nms_data tmp_data;
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            tmp_data.score     = my_scores[i];
+            block_out_scores[i] = my_scores[i];
             // numpy indexing: boxes[batch_idx, i, :]
-            tmp_data.box       = nms_normalize_box<CenterPointBox>(slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>()));
-            tmp_data.box_index = static_cast<int>(i);
+            auto normed_box = nms_normalize_box<CenterPointBox>(
+                slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>())
+            );
+            // numpy syntax: out_boxes[block_id, i, 0]
+            auto out_boxes_iter = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
+            copy(normed_box.begin(), normed_box.end(), out_boxes_iter);
+            block_out_indices[i] = i;
         }
         else
         {
             // Sentinel score so it never beats any real entry
-            tmp_data.score     = numeric_lowest<typename Boxes::type>();
-            tmp_data.box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
-            tmp_data.box_index = -1;
+            block_out_scores[i] = numeric_lowest<typename Boxes::type>();
+            auto filler_box = array<float, 4>{0.f, 0.f, 0.f, 0.f};
+            auto out_boxes_iter = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
+            copy(filler_box.begin(), filler_box.end(), out_boxes_iter);
+            block_out_indices[i] = -1;
         }
-        block_out_tv[i] = tmp_data;
+        block_nms_data[i] = {
+            block_out_scores.begin_at(array<index_int, 2>{0, i}),
+            block_out_boxes.begin_at(array<index_int, 3>{0, i, 0}),
+            block_out_indices.begin_at(array<index_int, 2>{0, i})
+        };
     });
     __syncthreads();
-    bitonic_sort<nms_score_greater>{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_out_tv);
+    bitonic_sort<nms_score_greater<scores_type, boxes_type, indices_type>>
+    {
+        nms_score_greater<scores_type, boxes_type, indices_type>{}
+    }.template block_sort<AlignedNumBoxes>(idx, block_nms_data);
 }
 
-// Build the packed upper-triangular IoU mask for the NumBoxes sorted boxes.
+// Build the packed upper-triangular IoU mask for the NumBoxes nms_data boxes.
 // Work is striped such that each thread does a multiple of 2 rows so each does roughly the same
 // amount of work regardless of where it falls in the triangle.
-// `sorted`: sorted nms_data{} tensor
+// `nms_data`: nms_data nms_data{} tensor
 // `mask`: bool mask tensor
-template <index_int NumBoxes, class SortedData, class Mask, class IouThreshold>
-__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, IouThreshold iou_threshold)
+template <index_int NumBoxes, class NMSData, class Mask, class IouThreshold>
+__device__ void nms_make_iou_mask(index idx, const NMSData nms_data, Mask mask, IouThreshold iou_threshold)
 {
     static_assert(NumBoxes > 0);
     constexpr index_int half = NumBoxes / 2;
@@ -178,7 +206,7 @@ __device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask,
         for(index_int j = i + 1; j < NumBoxes; ++j)
         {
             mask[nms_packed_idx(i, j, NumBoxes)] =
-                nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_threshold) ? 1 : 0;
+                nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold) ? 1 : 0;
         }
     };
 
@@ -197,9 +225,9 @@ __device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask,
 
 // TODO: use template for types
 // Greedy filter that writes selections into a per-batch per-class region of output.
-template <index_int NumBoxes, index_int NumClasses, class Sorted, class Mask, class Output, class Counts>
+template <index_int NumBoxes, index_int NumClasses, class NMSData, class Mask, class Output, class Counts>
 __device__ void nms_filter_per_block(index idx,
-                                     const Sorted sorted,
+                                     const NMSData nms_data,
                                      const Mask mask,
                                      int64_t max_output,
                                      float score_thr,
@@ -215,7 +243,7 @@ __device__ void nms_filter_per_block(index idx,
     // Match the ref op: only filter by score when score_threshold > 0.
     const bool do_filter = score_thr > 0.f;
     idx.local_stride(NumBoxes, [&](auto i) {
-        removed[i] = (do_filter and sorted[i].score < score_thr);
+        removed[i] = (do_filter and *(nms_data[i].score) < score_thr);
     });
     __syncthreads();
 
@@ -231,9 +259,9 @@ __device__ void nms_filter_per_block(index idx,
         {
             if(idx.local == 0)
             {
-                output[output_idx * 3 + 0] = batch_idx;
-                output[output_idx * 3 + 1] = class_idx;
-                output[output_idx * 3 + 2] = sorted[i].box_index;
+                array<typename Output::type, 3> tmp = {batch_idx, class_idx, *(nms_data[i].box_index)};
+                auto output_iter = output.begin_at(array<index_int, 3>{block_id, output_idx, 0});
+                copy(tmp.begin(), tmp.end(), output_iter);
             }
             ++output_idx;
             for(index_int j = i + 1 + idx.local; j < NumBoxes; j += idx.nlocal())
@@ -254,14 +282,18 @@ template <index_int NumBatches,
           index_int NumClasses,
           index_int NumBoxes,
           index_int AlignedNumBoxes,
-          class Sorted,
+          class SortedScores,
+          class SortedBoxes,
+          class SortedIndices,
           class MaxOut,
           class IouThr,
           class ScoreThr,
           class Mask,
           class Output,
           class Counts>
-__device__ void nonmaxsuppression_filter(Sorted sorted_buf,
+__device__ void nonmaxsuppression_filter(SortedScores sorted_scores,
+                                         SortedBoxes sorted_boxes,
+                                         SortedIndices sorted_indices,
                                          MaxOut max_out_p,
                                          IouThr iou_thr_p,
                                          ScoreThr score_thr_p,
@@ -274,31 +306,48 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf,
     static_assert(NumBoxes > 0);
 
     auto idx                            = make_index();
-    const index_int block_id            = idx.group;
-    //constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1;
+    const index_int block_idx            = idx.group;
 
-    constexpr auto my_sorted_shape = make_shape(index_ints<NumBoxes>{});
-    nms_data* my_sorted_p = reinterpret_cast<nms_data*>(sorted_buf.data()) + block_id * AlignedNumBoxes;
-    auto my_sorted = make_tensor_view<nms_data>(my_sorted_p, my_sorted_shape);
+    auto my_sorted_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
+    auto my_sorted_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
+    auto my_sorted_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
+
+    using scores_type = decltype(my_sorted_scores.begin());
+    using boxes_type = decltype(my_sorted_boxes.begin());
+    using indices_type = decltype(my_sorted_indices.begin());
+    __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, NumBoxes> block_nms_data;
+
+    idx.local_stride(AlignedNumBoxes, [&](auto i) {
+        if(i < NumBoxes)
+        {
+            block_nms_data[i] = {
+                my_sorted_scores.begin_at(array<index_int, 2>{0, i}),
+                my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0}),
+                my_sorted_indices.begin_at(array<index_int, 2>{0, i})
+            };
+        }
+    });
+    __syncthreads();
     
-    auto my_mask = slice_tensor(mask, block_id, slice_axes<1>());
-    auto my_output = slice_tensor(output, block_id, slice_axes<1, 2>());
+    auto my_mask = slice_tensor(mask, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
+    auto my_output = slice_tensor(output, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
 
     // Read scalar tensor inputs
     const int64_t max_output_boxes_per_class = max_out_p[0];
     const float iou_thr_val   = iou_thr_p[0];
     const float score_thr_val = score_thr_p[0];
 
-    nms_make_iou_mask<NumBoxes>(idx, my_sorted, my_mask, iou_thr_val);
+    nms_make_iou_mask<NumBoxes>(idx, block_nms_data, my_mask, iou_thr_val);
     __syncthreads();
 
-    nms_filter_per_block<NumBoxes, NumClasses>(idx,
-                                   my_sorted,
-                                   my_mask,
-                                   max_output_boxes_per_class,
-                                   score_thr_val,
-                                   my_output,
-                                   bc_counts);
+    nms_filter_per_block<NumBoxes, NumClasses>(
+        idx,
+        block_nms_data,
+        my_mask,
+        max_output_boxes_per_class,
+        score_thr_val,
+        my_output,
+        bc_counts);
 }
 
 
@@ -325,6 +374,7 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts,
     auto idx = make_index();
     __shared__ index_int offsets[NumBatchClass];
     // Exclusive prefix sum on bc_counts to get offsets
+    // TODO: there's probably a better way to get the exclusive prefix sum rather than doing the minus each time.
     block_scan(
         idx,
         op::sum{},
@@ -340,7 +390,8 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts,
         num_selected[0] = offsets[NumBatchClass-1] + bc_counts[NumBatchClass-1];
     }
 
-    // swap index values to make the output packed
+    // rearrange index values to make the output packed.
+    // TODO: this could be done in-place to save memory.
     constexpr index_int index_size = 3;
     constexpr index_int max_entries = NumBatchClass * NumBoxes;
     idx.local_stride(max_entries, [&](auto i) {
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index 980a628682b..fb59d3724c6 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -160,7 +160,14 @@ struct bitonic_sort
                     {
                         const bool reverse = (tid & k) != 0;
                         if(this->compare(buf[tid], buf[partner], reverse))
-                            swap(buf[tid], buf[partner]);
+                        {
+                            swap(*(buf[tid].score), *(buf[partner].score));
+                            swap(*(buf[tid].box), *(buf[partner].box));
+                            swap(*(buf[tid].box+1), *(buf[partner].box+1));
+                            swap(*(buf[tid].box+2), *(buf[partner].box+2));
+                            swap(*(buf[tid].box+3), *(buf[partner].box+3));
+                            swap(*(buf[tid].box_index), *(buf[partner].box_index));
+                        }
                     }
                 });
                 __syncthreads();
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 1a9275de52b..196a0353a35 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -109,6 +109,7 @@ struct miopen_apply
         add_loop_op();
         add_neg_op();
         add_lrn_op();
+        add_nms_op();
         add_convolution_backwards_op();
         add_select_module_op();
         add_reshape_lazy_op();
@@ -446,6 +447,26 @@ struct miopen_apply
         });
     }
 
+    void add_nms_op()
+    {
+        apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
+            auto s      = ins->get_shape();
+            auto output = insert_allocation(ins, s);
+            std::vector<instruction_ref> cpu_inputs;
+            auto inputs = ins->inputs();
+            std::transform(
+                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
+                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
+                });
+            cpu_inputs.front() =
+                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
+            auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
+            auto gpu_out =
+                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
+            return mod->replace_instruction(ins, gpu_out);
+        });
+    }
+
     void add_lrn_op()
     {
         apply_map.emplace("lrn", [=](instruction_ref ins) {

From b5c1e7760e14c29444beadd4ca7ca54779b46a56 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 20 May 2026 16:40:06 -0500
Subject: [PATCH 17/32] Kernel version using block shared memory for nms_data

---
 .../migraphx/kernels/nonmaxsuppression.hpp    | 149 +++++++++---------
 .../kernels/include/migraphx/kernels/sort.hpp |  13 +-
 2 files changed, 76 insertions(+), 86 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index f999855c8e9..5ac4c46f4cb 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -41,16 +41,26 @@ namespace migraphx {
 template <class Score, class Box, class Index>
 struct nms_data
 {
-    // should hold iterators
+    // holds a copy of data
     Score score;
-    Box box;
+    array<Box, 4> box;
     Index box_index;
 };
 
+// Comparator for sorting nms_data{}.
+template <class Score, class Box, class Index>
+struct nms_score_greater
+{
+    constexpr bool operator()(const nms_data<Score, Box, Index>& a, const nms_data<Score, Box, Index>& b) const
+    {
+        return a.score > b.score;
+    }
+};
+
 // Decode a single box into (xmin, ymin, xmax, ymax) corners.
 // Normalize such that [x1, y1] is the bottom left corner.
 template <bool CenterPointBox, class Box>
-__device__ inline array<typename Box::type, 4> nms_normalize_box(Box box)
+__device__ inline array<typename Box::type, 4> nms_normalize_box(const Box box)
 {
     if constexpr(CenterPointBox)
     {
@@ -77,7 +87,7 @@ __device__ inline array<typename Box::type, 4> nms_normalize_box(Box box)
 
 template <class Box, class Threshold>
 __device__ inline bool
-nms_iou_over_threshold(const Box a, Box b, Threshold threshold)
+nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold)
 {
     const float left   = max(a[0], b[0]);
     const float right  = min(a[2], b[2]);
@@ -100,16 +110,6 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N
     return (i * N - (i * (i + 1)) / 2) + j - (i + 1);
 }
 
-// Comparator for sorting nms_data{}.
-template <class Score, class Box, class Index>
-struct nms_score_greater
-{
-    constexpr bool operator()(const nms_data<Score, Box, Index>& a, const nms_data<Score, Box, Index>& b) const
-    {
-        return *(a.score) > *(b.score);
-    }
-};
-
 // One block per (batch_idx, class_idx).
 // Load data into per-block buffer of nms_data.
 // Pads values after N with sentinel values.
@@ -128,8 +128,8 @@ template <bool CenterPointBox,
           class SortedBoxes,
           class SortedIndices>
 __device__ void nonmaxsuppression_sort(
-    Boxes boxes_tv,
-    Scores scores_tv,
+    const Boxes boxes_tv,
+    const Scores scores_tv,
     SortedScores sorted_scores,
     SortedBoxes sorted_boxes,
     SortedIndices sorted_indices)
@@ -146,49 +146,44 @@ __device__ void nonmaxsuppression_sort(
     
     // numpy indexing: scores[batch_idx, class_idx, :]
     const auto my_scores = slice_tensor(scores_tv, array<index_int, 3>{batch_idx, class_idx, 0}, slice_axes<2>());
-    
-    auto block_out_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
-    auto block_out_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
-    auto block_out_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
-
-    using scores_type = decltype(block_out_scores.begin());
-    using boxes_type = decltype(block_out_boxes.begin());
-    using indices_type = decltype(block_out_indices.begin());
+   
+    using scores_type = typename SortedScores::type;
+    using boxes_type = typename SortedBoxes::type;
+    using indices_type = typename SortedIndices::type;
+    // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS.
+    // TODO: can add a static_assert on needed LDS size
     __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, AlignedNumBoxes> block_nms_data;
-
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            block_out_scores[i] = my_scores[i];
-            // numpy indexing: boxes[batch_idx, i, :]
-            auto normed_box = nms_normalize_box<CenterPointBox>(
-                slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>())
-            );
-            // numpy syntax: out_boxes[block_id, i, 0]
-            auto out_boxes_iter = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
-            copy(normed_box.begin(), normed_box.end(), out_boxes_iter);
-            block_out_indices[i] = i;
+            block_nms_data[i].score     = my_scores[i];
+            block_nms_data[i].box       = nms_normalize_box<CenterPointBox>(
+                    slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>()));
+            block_nms_data[i].box_index = static_cast<int32_t>(i);
         }
         else
         {
-            // Sentinel score so it never beats any real entry
-            block_out_scores[i] = numeric_lowest<typename Boxes::type>();
-            auto filler_box = array<float, 4>{0.f, 0.f, 0.f, 0.f};
-            auto out_boxes_iter = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
-            copy(filler_box.begin(), filler_box.end(), out_boxes_iter);
-            block_out_indices[i] = -1;
+            block_nms_data[i].score     = numeric_lowest<typename Boxes::type>();
+            block_nms_data[i].box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
+            block_nms_data[i].box_index = -1;
         }
-        block_nms_data[i] = {
-            block_out_scores.begin_at(array<index_int, 2>{0, i}),
-            block_out_boxes.begin_at(array<index_int, 3>{0, i, 0}),
-            block_out_indices.begin_at(array<index_int, 2>{0, i})
-        };
     });
     __syncthreads();
-    bitonic_sort<nms_score_greater<scores_type, boxes_type, indices_type>>
-    {
-        nms_score_greater<scores_type, boxes_type, indices_type>{}
-    }.template block_sort<AlignedNumBoxes>(idx, block_nms_data);
+
+    bitonic_sort<nms_score_greater<scores_type, boxes_type, indices_type>>{nms_score_greater<scores_type, boxes_type, indices_type>{}}
+    .template block_sort<AlignedNumBoxes>(idx, block_nms_data);
+    __syncthreads();
+
+    // Copy sorted result back to global memory.
+    auto block_out_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+    auto block_out_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
+    auto block_out_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+    idx.local_stride(AlignedNumBoxes, [&](auto i) {
+        block_out_scores[i]  = block_nms_data[i].score;
+        auto out_box_iter    = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
+        copy(block_nms_data[i].box.begin(), block_nms_data[i].box.end(), out_box_iter);
+        block_out_indices[i] = block_nms_data[i].box_index;
+    });
 }
 
 // Build the packed upper-triangular IoU mask for the NumBoxes nms_data boxes.
@@ -197,7 +192,7 @@ __device__ void nonmaxsuppression_sort(
 // `nms_data`: nms_data nms_data{} tensor
 // `mask`: bool mask tensor
 template <index_int NumBoxes, class NMSData, class Mask, class IouThreshold>
-__device__ void nms_make_iou_mask(index idx, const NMSData nms_data, Mask mask, IouThreshold iou_threshold)
+__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const IouThreshold iou_threshold)
 {
     static_assert(NumBoxes > 0);
     constexpr index_int half = NumBoxes / 2;
@@ -226,12 +221,12 @@ __device__ void nms_make_iou_mask(index idx, const NMSData nms_data, Mask mask,
 // TODO: use template for types
 // Greedy filter that writes selections into a per-batch per-class region of output.
 template <index_int NumBoxes, index_int NumClasses, class NMSData, class Mask, class Output, class Counts>
-__device__ void nms_filter_per_block(index idx,
+__device__ void nms_filter_per_block(const index idx,
                                      const NMSData nms_data,
                                      const Mask mask,
-                                     int64_t max_output,
-                                     float score_thr,
-                                     Output output,
+                                     const int64_t max_output,
+                                     const float score_thr,
+                                     Output block_output,
                                      Counts bc_counts)
 {
     static_assert(NumBoxes > 0);
@@ -243,7 +238,7 @@ __device__ void nms_filter_per_block(index idx,
     // Match the ref op: only filter by score when score_threshold > 0.
     const bool do_filter = score_thr > 0.f;
     idx.local_stride(NumBoxes, [&](auto i) {
-        removed[i] = (do_filter and *(nms_data[i].score) < score_thr);
+        removed[i] = (do_filter and nms_data[i].score < score_thr);
     });
     __syncthreads();
 
@@ -259,8 +254,8 @@ __device__ void nms_filter_per_block(index idx,
         {
             if(idx.local == 0)
             {
-                array<typename Output::type, 3> tmp = {batch_idx, class_idx, *(nms_data[i].box_index)};
-                auto output_iter = output.begin_at(array<index_int, 3>{block_id, output_idx, 0});
+                array<typename Output::type, 3> tmp = {batch_idx, class_idx, nms_data[i].box_index};
+                auto output_iter = block_output.begin_at(array<index_int, 3>{0, output_idx, 0});
                 copy(tmp.begin(), tmp.end(), output_iter);
             }
             ++output_idx;
@@ -278,6 +273,8 @@ __device__ void nms_filter_per_block(index idx,
 
 // Per-block filter driver: one block per (batch_idx, class_idx).`.
 // Expecting box-coordinate convention has already been normalized into corner form.
+// TODO: Merge the nonmaxsuppression_sort and nonmaxsuppression_filter kernels by relaxing
+// the AlignedNumBoxes restriction for the sort.
 template <index_int NumBatches,
           index_int NumClasses,
           index_int NumBoxes,
@@ -291,12 +288,12 @@ template <index_int NumBatches,
           class Mask,
           class Output,
           class Counts>
-__device__ void nonmaxsuppression_filter(SortedScores sorted_scores,
-                                         SortedBoxes sorted_boxes,
-                                         SortedIndices sorted_indices,
-                                         MaxOut max_out_p,
-                                         IouThr iou_thr_p,
-                                         ScoreThr score_thr_p,
+__device__ void nonmaxsuppression_filter(const SortedScores sorted_scores,
+                                         const SortedBoxes sorted_boxes,
+                                         const SortedIndices sorted_indices,
+                                         const MaxOut max_out_p,
+                                         const IouThr iou_thr_p,
+                                         const ScoreThr score_thr_p,
                                          Mask mask,
                                          Output output,
                                          Counts bc_counts)
@@ -305,30 +302,29 @@ __device__ void nonmaxsuppression_filter(SortedScores sorted_scores,
     static_assert(NumClasses > 0);
     static_assert(NumBoxes > 0);
 
-    auto idx                            = make_index();
-    const index_int block_idx            = idx.group;
+    auto idx = make_index();
+    const index_int block_idx = idx.group;
 
     auto my_sorted_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
     auto my_sorted_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
     auto my_sorted_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
 
-    using scores_type = decltype(my_sorted_scores.begin());
-    using boxes_type = decltype(my_sorted_boxes.begin());
-    using indices_type = decltype(my_sorted_indices.begin());
+    using scores_type = typename SortedScores::type;
+    using boxes_type = typename SortedBoxes::type;
+    using indices_type = typename SortedIndices::type;
+    // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS.
+    // TODO: can add a static_assert on needed LDS size
     __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, NumBoxes> block_nms_data;
 
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            block_nms_data[i] = {
-                my_sorted_scores.begin_at(array<index_int, 2>{0, i}),
-                my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0}),
-                my_sorted_indices.begin_at(array<index_int, 2>{0, i})
-            };
+            block_nms_data[i].score = my_sorted_scores[i];
+            auto boxes_iter = my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0});
+            copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin());
+            block_nms_data[i].box_index = my_sorted_indices[i];
         }
     });
-    __syncthreads();
-    
     auto my_mask = slice_tensor(mask, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
     auto my_output = slice_tensor(output, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
 
@@ -337,9 +333,10 @@ __device__ void nonmaxsuppression_filter(SortedScores sorted_scores,
     const float iou_thr_val   = iou_thr_p[0];
     const float score_thr_val = score_thr_p[0];
 
-    nms_make_iou_mask<NumBoxes>(idx, block_nms_data, my_mask, iou_thr_val);
     __syncthreads();
+    nms_make_iou_mask<NumBoxes>(idx, block_nms_data, my_mask, iou_thr_val);
 
+    __syncthreads();
     nms_filter_per_block<NumBoxes, NumClasses>(
         idx,
         block_nms_data,
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index fb59d3724c6..b49d78ca572 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -146,8 +146,8 @@ struct bitonic_sort
     // (e.g. greater{} -> descending). The buffer must be sized to N (a
     // compile-time power of 2); callers pad with sentinel values when the
     // logical length is smaller.
-    template <index_int N, class Array>
-    __device__ void block_sort(index idx, Array& buf) const
+    template <index_int N, class T>
+    __device__ void block_sort(index idx, T& buf) const
     {
         static_assert(is_power_of_2(N), "N must be a power of 2");
         for(index_int k = 2; k <= N; k <<= 1)
@@ -160,14 +160,7 @@ struct bitonic_sort
                     {
                         const bool reverse = (tid & k) != 0;
                         if(this->compare(buf[tid], buf[partner], reverse))
-                        {
-                            swap(*(buf[tid].score), *(buf[partner].score));
-                            swap(*(buf[tid].box), *(buf[partner].box));
-                            swap(*(buf[tid].box+1), *(buf[partner].box+1));
-                            swap(*(buf[tid].box+2), *(buf[partner].box+2));
-                            swap(*(buf[tid].box+3), *(buf[partner].box+3));
-                            swap(*(buf[tid].box_index), *(buf[partner].box_index));
-                        }
+                            swap(buf[tid], buf[partner]);
                     }
                 });
                 __syncthreads();

From 101125604a67afb9fb5edb56941534394c5e0961 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 20 May 2026 17:20:44 -0500
Subject: [PATCH 18/32] Progress on polish

---
 src/targets/gpu/jit/nonmaxsuppression.cpp     |  4 +-
 .../migraphx/kernels/nonmaxsuppression.hpp    | 40 +++++++++----------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index ff6b1d32ece..9d0c8cc3efd 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -195,8 +195,8 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
         const auto aligned_num_boxes =
             static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
         // TODO: tune for max block size?
-        // num_boxes/2 because of strided thread work distribution
-        const auto block_size = compute_block_size(ctx, num_boxes/2, 256);
+        // ceil_div(num_boxes, 2) because of strided thread work distribution
+        const auto block_size = compute_block_size(ctx, (num_boxes + 1)/2, 256);
 
         hip_compile_options options;
         options.inputs         = flatten_shapes(inputs);
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 5ac4c46f4cb..ab4eb053cd6 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -47,11 +47,11 @@ struct nms_data
     Index box_index;
 };
 
-// Comparator for sorting nms_data{}.
-template <class Score, class Box, class Index>
+// Comparator for sorting nms_data{} (or anything else with a `.score` field).
 struct nms_score_greater
 {
-    constexpr bool operator()(const nms_data<Score, Box, Index>& a, const nms_data<Score, Box, Index>& b) const
+    template <class T>
+    constexpr bool operator()(const T& a, const T& b) const
     {
         return a.score > b.score;
     }
@@ -114,9 +114,11 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N
 // Load data into per-block buffer of nms_data.
 // Pads values after N with sentinel values.
 // Sorts the nms_data in descending order by score.
-// boxes_tv: dims([N, 4]) of float.
-// scores_tv: dims([N]) of float.
-// sorted_tv: dims([N]) of nms_data{}.
+// boxes_tv: dims([NumBatches, NumBoxes, 4])
+// scores_tv: dims([NumBatches, NumClasses, NumBoxes])
+// sorted_scores: output, dims([B, C, AlignedNumBoxes])
+// sorted_boxes: output, dims([B, C, AlignedNumBoxes, 4])
+// sorted_indices: output, dims([B, C, AlignedNumBoxes])
 template <bool CenterPointBox,
           index_int NumBatches,
           index_int NumClasses,
@@ -163,16 +165,14 @@ __device__ void nonmaxsuppression_sort(
         }
         else
         {
-            block_nms_data[i].score     = numeric_lowest<typename Boxes::type>();
-            block_nms_data[i].box       = array<float, 4>{0.f, 0.f, 0.f, 0.f};
+            block_nms_data[i].score     = numeric_lowest<scores_type>();
+            block_nms_data[i].box       = array<boxes_type, 4>{0.f, 0.f, 0.f, 0.f};
             block_nms_data[i].box_index = -1;
         }
     });
     __syncthreads();
 
-    bitonic_sort<nms_score_greater<scores_type, boxes_type, indices_type>>{nms_score_greater<scores_type, boxes_type, indices_type>{}}
-    .template block_sort<AlignedNumBoxes>(idx, block_nms_data);
-    __syncthreads();
+    bitonic_sort{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_nms_data);
 
     // Copy sorted result back to global memory.
     auto block_out_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
@@ -201,7 +201,7 @@ __device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask
         for(index_int j = i + 1; j < NumBoxes; ++j)
         {
             mask[nms_packed_idx(i, j, NumBoxes)] =
-                nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold) ? 1 : 0;
+                nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold);
         }
     };
 
@@ -316,14 +316,11 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores,
     // TODO: can add a static_assert on needed LDS size
     __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, NumBoxes> block_nms_data;
 
-    idx.local_stride(AlignedNumBoxes, [&](auto i) {
-        if(i < NumBoxes)
-        {
-            block_nms_data[i].score = my_sorted_scores[i];
-            auto boxes_iter = my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0});
-            copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin());
-            block_nms_data[i].box_index = my_sorted_indices[i];
-        }
+    idx.local_stride(NumBoxes, [&](auto i) {
+        block_nms_data[i].score = my_sorted_scores[i];
+        auto boxes_iter = my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0});
+        copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin());
+        block_nms_data[i].box_index = my_sorted_indices[i];
     });
     auto my_mask = slice_tensor(mask, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
     auto my_output = slice_tensor(output, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
@@ -366,7 +363,8 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts,
 {
     static_assert(NumBatchClass > 0);
     static_assert(NumBoxes > 0);
-    static_assert(NumBatchClass <= 16000, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]");
+    // TODO: get a better bound on this
+    static_assert(NumBatchClass <= 8192, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]");
 
     auto idx = make_index();
     __shared__ index_int offsets[NumBatchClass];

From b5a95684db5512fbc26f14d72c0611b103b65c05 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 20 May 2026 18:58:10 -0500
Subject: [PATCH 19/32] Minor cleanup

---
 .../include/migraphx/kernels/nonmaxsuppression.hpp        | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index ab4eb053cd6..7e050e1b51b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -191,8 +191,8 @@ __device__ void nonmaxsuppression_sort(
 // amount of work regardless of where it falls in the triangle.
 // `nms_data`: nms_data nms_data{} tensor
 // `mask`: bool mask tensor
-template <index_int NumBoxes, class NMSData, class Mask, class IouThreshold>
-__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const IouThreshold iou_threshold)
+template <index_int NumBoxes, class NMSData, class Mask>
+__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold)
 {
     static_assert(NumBoxes > 0);
     constexpr index_int half = NumBoxes / 2;
@@ -224,7 +224,7 @@ template <index_int NumBoxes, index_int NumClasses, class NMSData, class Mask, c
 __device__ void nms_filter_per_block(const index idx,
                                      const NMSData nms_data,
                                      const Mask mask,
-                                     const int64_t max_output,
+                                     const int64_t max_output,
                                      const float score_thr,
                                      Output block_output,
                                      Counts bc_counts)
@@ -326,7 +326,7 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores,
     auto my_output = slice_tensor(output, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
 
     // Read scalar tensor inputs
-    const int64_t max_output_boxes_per_class = max_out_p[0];
+    const int64_t max_output_boxes_per_class = max_out_p[0]; // int64 input; do not narrow
     const float iou_thr_val   = iou_thr_p[0];
     const float score_thr_val = score_thr_p[0];
 

From 32c779d94effdedbaeb18c562b51a2c10e1fcb10 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 13:19:20 -0500
Subject: [PATCH 20/32] Move prepare_nonmaxsuppression into lowering

---
 src/targets/gpu/CMakeLists.txt                |  2 +-
 .../gpu/prepare_nonmaxsuppression.hpp         | 48 ----------
 .../migraphx/kernels/nonmaxsuppression.hpp    |  1 -
 src/targets/gpu/lowering.cpp                  | 77 +++++++++++++---
 ...pare_nonmaxsuppression.cpp => nms_ops.cpp} | 91 +------------------
 src/targets/gpu/target.cpp                    |  3 -
 6 files changed, 68 insertions(+), 154 deletions(-)
 delete mode 100644 src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp
 rename src/targets/gpu/{prepare_nonmaxsuppression.cpp => nms_ops.cpp} (56%)

diff --git a/src/targets/gpu/CMakeLists.txt b/src/targets/gpu/CMakeLists.txt
index b8e92310b99..eee696a85a9 100644
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -179,11 +179,11 @@ add_library(migraphx_gpu
     loop.cpp
     lrn.cpp
     mlir.cpp
+    nms_ops.cpp
     no_device.cpp
     pack_args.cpp
     prefuse_ops.cpp
     prepare_mlir.cpp
-    prepare_nonmaxsuppression.cpp
     prepare_reduce.cpp
     perfdb.cpp
     pooling.cpp
diff --git a/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp b/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp
deleted file mode 100644
index bf47c8607b9..00000000000
--- a/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- */
-#ifndef MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP
-#define MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP
-
-#include <migraphx/config.hpp>
-#include <migraphx/gpu/export.h>
-#include <string>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-
-struct module;
-
-namespace gpu {
-
-struct MIGRAPHX_GPU_EXPORT prepare_nonmaxsuppression
-{
-    std::string name() const { return "gpu::prepare_nonmaxsuppression"; }
-    void apply(module& m) const;
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-#endif // MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 7e050e1b51b..5ba50f436d4 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -218,7 +218,6 @@ __device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask
     }
 }
 
-// TODO: use template for types
 // Greedy filter that writes selections into a per-batch per-class region of output.
 template <index_int NumBoxes, index_int NumClasses, class NMSData, class Mask, class Output, class Counts>
 __device__ void nms_filter_per_block(const index idx,
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 196a0353a35..976d84ade0e 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -27,6 +27,7 @@
 
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/literal.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/stringutils.hpp>
@@ -447,23 +448,73 @@ struct miopen_apply
         });
     }
 
+    // Rewrites onnx `nonmaxsuppression` into the GPU op pipeline:
+    //   gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact
+    // Each gpu::nms_* op is wrapped in gpu::precompile_op inline so the JIT
+    // compile pass can pick them up later. We can't rely on the main lowering
+    // loop to wrap them: it walks forward, and the new instructions land
+    // before `ins` so they would never be revisited.
     void add_nms_op()
     {
         apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
-            auto s      = ins->get_shape();
-            auto output = insert_allocation(ins, s);
-            std::vector<instruction_ref> cpu_inputs;
             auto inputs = ins->inputs();
-            std::transform(
-                inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
-                    return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
-                });
-            cpu_inputs.front() =
-                mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
-            auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
-            auto gpu_out =
-                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
-            return mod->replace_instruction(ins, gpu_out);
+            const auto& boxes_s  = inputs[0]->get_shape();
+            const auto& scores_s = inputs[1]->get_shape();
+            const auto num_batches = boxes_s.lens()[0];
+            const auto num_boxes   = boxes_s.lens()[1];
+            const auto num_classes = scores_s.lens()[1];
+            const auto iou_packed  = num_boxes * (num_boxes - 1) / 2;
+
+            // Fill in missing optional scalar inputs with default literals.
+            const shape default_max_s{shape::int64_type, {1}};
+            const shape default_iou_s{shape::float_type, {1}};
+            const shape default_thr_s{shape::float_type, {1}};
+            if(inputs.size() < 3)
+                inputs.push_back(
+                    mod->insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
+            if(inputs.size() < 4)
+                inputs.push_back(mod->insert_literal(ins, literal{default_iou_s, {0.0f}}));
+            if(inputs.size() < 5)
+                inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}}));
+
+            bool center_point_box =
+                ins->get_operator().to_value().at("center_point_box").to<bool>();
+
+            // Mask is scratch only; allocate up-front so the standard
+            // replace_allocate pass can later turn it into hip::allocate.
+            shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
+            auto mask_alloc = insert_allocation(ins, mask_shape);
+
+            auto sorted = mod->insert_instruction(
+                ins,
+                make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}),
+                inputs[0],
+                inputs[1]);
+            sorted = insert_precompile_op(sorted);
+
+            auto filter = mod->insert_instruction(
+                ins,
+                make_op("gpu::nms_filter",
+                        {{"num_batches", num_batches},
+                         {"num_classes", num_classes},
+                         {"num_boxes", num_boxes}}),
+                sorted,
+                inputs[2],
+                inputs[3],
+                inputs[4],
+                mask_alloc);
+            filter = insert_precompile_op(filter);
+
+            auto raw_output =
+                mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
+            auto bc_counts =
+                mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
+
+            auto compact = mod->insert_instruction(
+                ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
+            compact = insert_precompile_op(compact);
+
+            return mod->replace_instruction(ins, compact);
         });
     }
 
diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/nms_ops.cpp
similarity index 56%
rename from src/targets/gpu/prepare_nonmaxsuppression.cpp
rename to src/targets/gpu/nms_ops.cpp
index e6e2c021578..a1fb8fdfe48 100644
--- a/src/targets/gpu/prepare_nonmaxsuppression.cpp
+++ b/src/targets/gpu/nms_ops.cpp
@@ -22,14 +22,8 @@
  * THE SOFTWARE.
  *
  */
-#include <migraphx/gpu/prepare_nonmaxsuppression.hpp>
 #include <migraphx/bit.hpp>
 #include <migraphx/check_shapes.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/literal.hpp>
-#include <migraphx/make_op.hpp>
-#include <migraphx/matcher.hpp>
-#include <migraphx/module.hpp>
 #include <migraphx/register_op.hpp>
 
 #include <cstdint>
@@ -59,9 +53,10 @@ struct nms_sort
         if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3)
             MIGRAPHX_THROW("gpu::nms_sort: boxes and scores must be 3-D");
         const auto num_batches = boxes_s.lens()[0];
-        const auto num_boxes  = boxes_s.lens()[1];
+        const auto num_boxes   = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
-        const auto aligned_b = static_cast<std::size_t>(bit_ceil(static_cast<std::uint32_t>(num_boxes)));
+        const auto aligned_b =
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint32_t>(num_boxes)));
         shape out_scores_shape{shape::float_type, {num_batches * num_classes, aligned_b}};
         shape out_boxes_shape{shape::float_type, {num_batches * num_classes, aligned_b, 4}};
         shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}};
@@ -120,86 +115,6 @@ struct nms_compact
 };
 MIGRAPHX_REGISTER_OP(nms_compact);
 
-namespace {
-
-struct find_nonmaxsuppression
-{
-    auto matcher() const { return match::name("nonmaxsuppression"); }
-
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins    = r.result;
-        auto inputs = ins->inputs();
-        if(inputs.size() < 2 or inputs.size() > 5)
-            MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " +
-                           std::to_string(inputs.size()));
-
-        const auto& boxes_s  = inputs[0]->get_shape();
-        const auto& scores_s = inputs[1]->get_shape();
-        if(boxes_s.ndim() != 3 or scores_s.ndim() != 3)
-            MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D");
-
-        const auto num_batches = boxes_s.lens()[0];
-        const auto num_boxes = boxes_s.lens()[1];
-        const auto num_classes = scores_s.lens()[1];
-        const auto iou_packed = (num_boxes * (num_boxes - 1) / 2);
-
-        // Fill in missing optional scalar inputs with default literals.
-        // TODO: this is the wrong way to handle this. Should be checking if the input is eval'able.
-        const shape default_max_s{shape::int64_type, {1}};
-        const shape default_iou_s{shape::float_type, {1}};
-        const shape default_thr_s{shape::float_type, {1}};
-        if(inputs.size() < 3)
-            inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
-        if(inputs.size() < 4)
-            inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}}));
-        if(inputs.size() < 5)
-            inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}}));
-
-        auto op_val           = ins->get_operator().to_value();
-        bool center_point_box = op_val.at("center_point_box").to<bool>();
-
-        // Mask is scratch only; allocate up-front so the standard
-        // replace_allocate pass can later turn it into hip::allocate.
-        shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
-        auto mask_alloc =
-            m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}}));
-
-        auto sorted = m.insert_instruction(
-            ins,
-            make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}),
-            inputs[0],
-            inputs[1]);
-
-        auto filter = m.insert_instruction(
-            ins,
-            make_op("gpu::nms_filter",
-                    {{"num_batches", num_batches}, {"num_classes", num_classes}, {"num_boxes", num_boxes}}),
-            sorted,
-            inputs[2],
-            inputs[3],
-            inputs[4],
-            mask_alloc);
-
-        auto raw_output =
-            m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
-        auto bc_counts =
-            m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
-
-        auto compact =
-            m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
-
-        m.replace_instruction(ins, compact);
-    }
-};
-
-} // namespace
-
-void prepare_nonmaxsuppression::apply(module& m) const
-{
-    match::find_matches(m, find_nonmaxsuppression{});
-}
-
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp
index 8ff00a75b7b..3ed3e72033d 100644
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -73,7 +73,6 @@
 #include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
-#include <migraphx/gpu/prepare_nonmaxsuppression.hpp>
 #include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/schedule_model.hpp>
 #include <migraphx/gpu/sync_device.hpp>
@@ -164,8 +163,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         auto_contiguous{},
         dead_code_elimination{},
-        prepare_nonmaxsuppression{},
-        dead_code_elimination{},
         lowering{&ctx, options.offload_copy},
         eliminate_contiguous{"gpu::contiguous"},
         dead_code_elimination{},

From c5fb1070777d6eb49204075a505e983ed67d806d Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 13:34:17 -0500
Subject: [PATCH 21/32] Add env var for retaining current NMS behavior for now

---
 docs/reference/MIGraphX-dev-env-vars.rst      |  8 ++++++++
 src/include/migraphx/op/nonmaxsuppression.hpp |  2 ++
 src/onnx/parse_nonmaxsuppression.cpp          | 15 ++++++++++++---
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/docs/reference/MIGraphX-dev-env-vars.rst b/docs/reference/MIGraphX-dev-env-vars.rst
index d84879717ec..c554dc31fee 100644
--- a/docs/reference/MIGraphX-dev-env-vars.rst
+++ b/docs/reference/MIGraphX-dev-env-vars.rst
@@ -298,6 +298,14 @@ Model performance tunable variables change the compilation behavior of a model.
 
       | Default: Full dynamic shape support is disabled.
 
+  * - | ``MIGRAPHX_USE_DYNAMIC_NMS``
+      | When set, the ``NonMaxSuppression`` ONNX parser performs a dynamic slice on the raw indices tensor to trim it to the number of selected boxes, producing an output with a dynamic shape.
+
+    - | ``1``: A dynamic slice is applied to the raw indices tensor, producing a dynamic-shaped output.
+      | ``0``: Returns to default behavior.
+
+      | Default: The whole raw indices tensor is returned without slicing.
+
 Matching
 **********
 
diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index d154733a581..22cc28aec41 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -231,6 +231,8 @@ struct nonmaxsuppression
                             double iou_threshold,
                             double score_threshold) const
     {
+        // NOTE: should not need to fill with 0
+        std::fill(output.begin(), output.end(), 0);
         const auto& lens       = scores.get_shape().lens();
         const auto num_batches = lens[0];
         const auto num_classes = lens[1];
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index 0ffffa03bcd..b67f33d880c 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -25,6 +25,8 @@
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
 
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS);
+
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace onnx {
@@ -42,9 +44,16 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
         auto nms_ins = info.add_instruction(op, args);
         // variable ends input slice to handle dynamic shape output
         auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
-        auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
-        auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
-        return slice_ins;
+        if(not enabled(MIGRAPHX_USE_DYNAMIC_NMS{}))
+        {
+            return indices;
+        }
+        else
+        {
+            auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
+            auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
+            return slice_ins;
+        }
     }
 };
 

From 289d5adf2047e56fef1117c5d57716ad60a8b41f Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 13:43:05 -0500
Subject: [PATCH 22/32] Formatting

---
 src/include/migraphx/op/nonmaxsuppression.hpp |  13 +-
 src/onnx/parse_nonmaxsuppression.cpp          |   6 +-
 src/targets/gpu/compile_hip_code_object.cpp   |   1 -
 .../migraphx/gpu/compile_hip_code_object.hpp  |   2 +-
 src/targets/gpu/jit/nonmaxsuppression.cpp     |  44 +-
 .../migraphx/kernels/nonmaxsuppression.hpp    | 135 +--
 src/targets/gpu/lowering.cpp                  |  31 +-
 test/gpu/nonmaxsuppression.cpp                | 921 ++++++++++--------
 8 files changed, 654 insertions(+), 499 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 22cc28aec41..b71bc4822eb 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -47,7 +47,8 @@
  *                    optional(max_output_boxes_per_class),
  *                    optional(iou_threshold),
  *                    optional(score_threshold));
- *  Outputs tuple of {tensor with dims[max_num_boxes, 3]: selected_box_indices, scalar int64_t: num_selected_indices} 
+ *  Outputs tuple of {tensor with dims[max_num_boxes, 3]: selected_box_indices, scalar int64_t:
+ * num_selected_indices}
  */
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -298,7 +299,7 @@ struct nonmaxsuppression
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         // make buffer of maximum size
-        auto output_shapes = flatten_shapes({output_shape});
+        auto output_shapes     = flatten_shapes({output_shape});
         shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()};
         argument result{max_output_shape};
         argument num_selected_result{output_shapes.at(1)};
@@ -307,9 +308,7 @@ struct nonmaxsuppression
             (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
         if(max_output_boxes_per_class == 0)
         {
-            num_selected_result.visit([&](auto output){
-                output[0] = 0;
-            });
+            num_selected_result.visit([&](auto output) { output[0] = 0; });
             return {{result, num_selected_result}};
         }
         double iou_threshold     = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
@@ -326,9 +325,7 @@ struct nonmaxsuppression
                                            score_threshold);
             });
         });
-        num_selected_result.visit([&](auto output){
-            output[0] = num_selected;
-        });
+        num_selected_result.visit([&](auto output) { output[0] = num_selected; });
         return {{result, num_selected_result}};
     }
 };
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index b67f33d880c..74427a9d5b8 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -50,8 +50,10 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
         }
         else
         {
-            auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
-            auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
+            auto num_selected =
+                info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
+            auto slice_ins = info.add_instruction(
+                make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
             return slice_ins;
         }
     }
diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp
index efe3b4f80bd..868153c2c9e 100644
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -192,7 +192,6 @@ compute_global_for(const context& ctx, std::size_t n, std::size_t over)
     };
 }
 
-
 // `n`: The amount of parallel work within a block.
 // `max_block_size`: Upper limit on block size.
 std::size_t compute_block_size(const context& ctx, std::size_t n, std::size_t max_block_size)
diff --git a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
index 592e32b9af4..f11051916cf 100644
--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -36,7 +36,7 @@ namespace gpu {
 struct context;
 
 struct hip_compile_options
-{   
+{
     // Total number of threads
     std::size_t global;
     // Threads per block
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index 9d0c8cc3efd..732e8702410 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -145,10 +145,10 @@ struct nms_sort_compiler : compiler<nms_sort_compiler>
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto& boxes_s  = inputs[0];
-        const auto& scores_s = inputs[1];
+        const auto& boxes_s    = inputs[0];
+        const auto& scores_s   = inputs[1];
         const auto num_batches = boxes_s.lens()[0];
-        const auto num_boxes = boxes_s.lens()[1];
+        const auto num_boxes   = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
         const auto aligned_num_boxes =
             static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
@@ -191,12 +191,12 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
     {
         const auto num_batches = v.at("num_batches").to<std::size_t>();
         const auto num_classes = v.at("num_classes").to<std::size_t>();
-        const auto num_boxes  = v.at("num_boxes").to<std::size_t>();
+        const auto num_boxes   = v.at("num_boxes").to<std::size_t>();
         const auto aligned_num_boxes =
             static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
         // TODO: tune for max block size?
         // ceil_div(num_boxes, 2) because of strided thread work distribution
-        const auto block_size = compute_block_size(ctx, (num_boxes + 1)/2, 256);
+        const auto block_size = compute_block_size(ctx, (num_boxes + 1) / 2, 256);
 
         hip_compile_options options;
         options.inputs         = flatten_shapes(inputs);
@@ -205,14 +205,14 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
         options.virtual_inputs = options.inputs;
         options.set_launch_params(v, num_batches * num_classes * block_size, block_size);
 
-        auto src = interpolate_string(
-            nms_filter_kernel_src,
-            {{"params", enum_params(options.inputs.size(), "void * private_p")},
-             {"args", enum_params(options.inputs.size(), "private_p")},
-             {"num_batches", std::to_string(num_batches)},
-             {"num_classes", std::to_string(num_classes)},
-             {"num_boxes", std::to_string(num_boxes)},
-             {"aligned_num_boxes", std::to_string(aligned_num_boxes)}});
+        auto src =
+            interpolate_string(nms_filter_kernel_src,
+                               {{"params", enum_params(options.inputs.size(), "void * private_p")},
+                                {"args", enum_params(options.inputs.size(), "private_p")},
+                                {"num_batches", std::to_string(num_batches)},
+                                {"num_classes", std::to_string(num_classes)},
+                                {"num_boxes", std::to_string(num_boxes)},
+                                {"aligned_num_boxes", std::to_string(aligned_num_boxes)}});
         return compile_hip_code_object(ctx, src, options);
     }
 
@@ -231,10 +231,10 @@ struct nms_compact_compiler : compiler<nms_compact_compiler>
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto& cnt_s = inputs[0];
-        const auto& indices_s = inputs[1];
+        const auto& cnt_s          = inputs[0];
+        const auto& indices_s      = inputs[1];
         const auto num_batch_class = cnt_s.elements();
-        const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3});
+        const auto num_boxes       = indices_s.elements() / (num_batch_class * std::size_t{3});
         // TODO: tune for block size?
         // num_boxes block size could also work?
         const auto block_size = compute_block_size(ctx, num_batch_class * num_boxes, 256);
@@ -246,12 +246,12 @@ struct nms_compact_compiler : compiler<nms_compact_compiler>
         options.virtual_inputs = options.inputs;
         options.set_launch_params(v, block_size, block_size);
 
-        auto src = interpolate_string(
-            nms_compact_kernel_src,
-            {{"params", enum_params(options.inputs.size(), "void * private_p")},
-             {"args", enum_params(options.inputs.size(), "private_p")},
-             {"num_batch_class", std::to_string(num_batch_class)},
-             {"num_boxes", std::to_string(num_boxes)}});
+        auto src =
+            interpolate_string(nms_compact_kernel_src,
+                               {{"params", enum_params(options.inputs.size(), "void * private_p")},
+                                {"args", enum_params(options.inputs.size(), "private_p")},
+                                {"num_batch_class", std::to_string(num_batch_class)},
+                                {"num_boxes", std::to_string(num_boxes)}});
         return compile_hip_code_object(ctx, src, options);
     }
 
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 5ba50f436d4..7ae9638e173 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -86,8 +86,7 @@ __device__ inline array<typename Box::type, 4> nms_normalize_box(const Box box)
 }
 
 template <class Box, class Threshold>
-__device__ inline bool
-nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold)
+__device__ inline bool nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold)
 {
     const float left   = max(a[0], b[0]);
     const float right  = min(a[2], b[2]);
@@ -129,38 +128,40 @@ template <bool CenterPointBox,
           class SortedScores,
           class SortedBoxes,
           class SortedIndices>
-__device__ void nonmaxsuppression_sort(
-    const Boxes boxes_tv,
-    const Scores scores_tv,
-    SortedScores sorted_scores,
-    SortedBoxes sorted_boxes,
-    SortedIndices sorted_indices)
+__device__ void nonmaxsuppression_sort(const Boxes boxes_tv,
+                                       const Scores scores_tv,
+                                       SortedScores sorted_scores,
+                                       SortedBoxes sorted_boxes,
+                                       SortedIndices sorted_indices)
 {
     static_assert(NumBatches > 0);
     static_assert(NumClasses > 0);
     static_assert(NumBoxes > 0);
     static_assert(AlignedNumBoxes > 0);
 
-    auto idx = make_index();
+    auto idx                 = make_index();
     const index_int block_id = idx.group;
-    const int batch_idx = block_id / NumClasses;
-    const int class_idx = block_id % NumClasses;
-    
+    const int batch_idx      = block_id / NumClasses;
+    const int class_idx      = block_id % NumClasses;
+
     // numpy indexing: scores[batch_idx, class_idx, :]
-    const auto my_scores = slice_tensor(scores_tv, array<index_int, 3>{batch_idx, class_idx, 0}, slice_axes<2>());
-   
-    using scores_type = typename SortedScores::type;
-    using boxes_type = typename SortedBoxes::type;
+    const auto my_scores =
+        slice_tensor(scores_tv, array<index_int, 3>{batch_idx, class_idx, 0}, slice_axes<2>());
+
+    using scores_type  = typename SortedScores::type;
+    using boxes_type   = typename SortedBoxes::type;
     using indices_type = typename SortedIndices::type;
     // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS.
     // TODO: can add a static_assert on needed LDS size
-    __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, AlignedNumBoxes> block_nms_data;
+    __shared__
+        uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, AlignedNumBoxes>
+            block_nms_data;
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            block_nms_data[i].score     = my_scores[i];
-            block_nms_data[i].box       = nms_normalize_box<CenterPointBox>(
-                    slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>()));
+            block_nms_data[i].score = my_scores[i];
+            block_nms_data[i].box   = nms_normalize_box<CenterPointBox>(
+                slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>()));
             block_nms_data[i].box_index = static_cast<int32_t>(i);
         }
         else
@@ -175,12 +176,15 @@ __device__ void nonmaxsuppression_sort(
     bitonic_sort{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_nms_data);
 
     // Copy sorted result back to global memory.
-    auto block_out_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
-    auto block_out_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
-    auto block_out_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+    auto block_out_scores =
+        slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+    auto block_out_boxes =
+        slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
+    auto block_out_indices =
+        slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
-        block_out_scores[i]  = block_nms_data[i].score;
-        auto out_box_iter    = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
+        block_out_scores[i] = block_nms_data[i].score;
+        auto out_box_iter   = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
         copy(block_nms_data[i].box.begin(), block_nms_data[i].box.end(), out_box_iter);
         block_out_indices[i] = block_nms_data[i].box_index;
     });
@@ -192,7 +196,8 @@ __device__ void nonmaxsuppression_sort(
 // `nms_data`: nms_data nms_data{} tensor
 // `mask`: bool mask tensor
 template <index_int NumBoxes, class NMSData, class Mask>
-__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold)
+__device__ void
+nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold)
 {
     static_assert(NumBoxes > 0);
     constexpr index_int half = NumBoxes / 2;
@@ -219,7 +224,12 @@ __device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask
 }
 
 // Greedy filter that writes selections into a per-batch per-class region of output.
-template <index_int NumBoxes, index_int NumClasses, class NMSData, class Mask, class Output, class Counts>
+template <index_int NumBoxes,
+          index_int NumClasses,
+          class NMSData,
+          class Mask,
+          class Output,
+          class Counts>
 __device__ void nms_filter_per_block(const index idx,
                                      const NMSData nms_data,
                                      const Mask mask,
@@ -230,15 +240,14 @@ __device__ void nms_filter_per_block(const index idx,
 {
     static_assert(NumBoxes > 0);
     const index_int block_id = idx.group;
-    const int batch_idx = block_id / NumClasses;
-    const int class_idx = block_id % NumClasses;
+    const int batch_idx      = block_id / NumClasses;
+    const int class_idx      = block_id % NumClasses;
     // TODO: use bits for removed mask
     __shared__ uint8_t removed[NumBoxes];
     // Match the ref op: only filter by score when score_threshold > 0.
     const bool do_filter = score_thr > 0.f;
-    idx.local_stride(NumBoxes, [&](auto i) {
-        removed[i] = (do_filter and nms_data[i].score < score_thr);
-    });
+    idx.local_stride(NumBoxes,
+                     [&](auto i) { removed[i] = (do_filter and nms_data[i].score < score_thr); });
     __syncthreads();
 
     index_int output_idx = 0;
@@ -301,49 +310,51 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores,
     static_assert(NumClasses > 0);
     static_assert(NumBoxes > 0);
 
-    auto idx = make_index();
+    auto idx                  = make_index();
     const index_int block_idx = idx.group;
 
-    auto my_sorted_scores = slice_tensor(sorted_scores, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
-    auto my_sorted_boxes = slice_tensor(sorted_boxes, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
-    auto my_sorted_indices = slice_tensor(sorted_indices, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
+    auto my_sorted_scores =
+        slice_tensor(sorted_scores, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
+    auto my_sorted_boxes =
+        slice_tensor(sorted_boxes, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
+    auto my_sorted_indices =
+        slice_tensor(sorted_indices, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
 
-    using scores_type = typename SortedScores::type;
-    using boxes_type = typename SortedBoxes::type;
+    using scores_type  = typename SortedScores::type;
+    using boxes_type   = typename SortedBoxes::type;
     using indices_type = typename SortedIndices::type;
     // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS.
     // TODO: can add a static_assert on needed LDS size
-    __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, NumBoxes> block_nms_data;
+    __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, NumBoxes>
+        block_nms_data;
 
     idx.local_stride(NumBoxes, [&](auto i) {
         block_nms_data[i].score = my_sorted_scores[i];
-        auto boxes_iter = my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0});
+        auto boxes_iter         = my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0});
         copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin());
         block_nms_data[i].box_index = my_sorted_indices[i];
     });
-    auto my_mask = slice_tensor(mask, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
+    auto my_mask   = slice_tensor(mask, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
     auto my_output = slice_tensor(output, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
 
     // Read scalar tensor inputs
     const int max_output_boxes_per_class = max_out_p[0];
-    const float iou_thr_val   = iou_thr_p[0];
-    const float score_thr_val = score_thr_p[0];
+    const float iou_thr_val              = iou_thr_p[0];
+    const float score_thr_val            = score_thr_p[0];
 
     __syncthreads();
     nms_make_iou_mask<NumBoxes>(idx, block_nms_data, my_mask, iou_thr_val);
 
     __syncthreads();
-    nms_filter_per_block<NumBoxes, NumClasses>(
-        idx,
-        block_nms_data,
-        my_mask,
-        max_output_boxes_per_class,
-        score_thr_val,
-        my_output,
-        bc_counts);
+    nms_filter_per_block<NumBoxes, NumClasses>(idx,
+                                               block_nms_data,
+                                               my_mask,
+                                               max_output_boxes_per_class,
+                                               score_thr_val,
+                                               my_output,
+                                               bc_counts);
 }
 
-
 // Move batch/class box index entries to the beginning of the output buffer. Runs with 1 block.
 // `bc_counts`: Number of selected boxes per batch per class. (read-only)
 // `indices`: Box indices, kernel packs selected boxes in-place to the beginning
@@ -355,20 +366,20 @@ template <index_int NumBatchClass,
           class Idx,
           class Num,
           class Out>
-__device__ void nonmaxsuppression_compact(const Counts bc_counts,
-                                          const Idx indices,
-                                          Out output,
-                                          Num num_selected)
+__device__ void
+nonmaxsuppression_compact(const Counts bc_counts, const Idx indices, Out output, Num num_selected)
 {
     static_assert(NumBatchClass > 0);
     static_assert(NumBoxes > 0);
     // TODO: get a better bound on this
-    static_assert(NumBatchClass <= 8192, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]");
+    static_assert(NumBatchClass <= 8192,
+                  "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]");
 
     auto idx = make_index();
     __shared__ index_int offsets[NumBatchClass];
     // Exclusive prefix sum on bc_counts to get offsets
-    // TODO: there's probably a better way to get the exclusive prefix sum rather than doing the minus each time.
+    // TODO: there's probably a better way to get the exclusive prefix sum rather than doing the
+    // minus each time.
     block_scan(
         idx,
         op::sum{},
@@ -381,22 +392,22 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts,
     // Get num_selected_boxes from last value of exclusive scan and add last bc_counts value.
     if(idx.local == 0)
     {
-        num_selected[0] = offsets[NumBatchClass-1] + bc_counts[NumBatchClass-1];
+        num_selected[0] = offsets[NumBatchClass - 1] + bc_counts[NumBatchClass - 1];
     }
 
     // rearrange index values to make the output packed.
     // TODO: this could be done in-place to save memory.
-    constexpr index_int index_size = 3;
+    constexpr index_int index_size  = 3;
     constexpr index_int max_entries = NumBatchClass * NumBoxes;
     idx.local_stride(max_entries, [&](auto i) {
         const index_int batch_class_idx = i / NumBoxes;
-        const index_int box_idx = i % NumBoxes;
+        const index_int box_idx         = i % NumBoxes;
         if(box_idx < bc_counts[batch_class_idx])
         {
             for(int k = 0; k < 3; ++k)
             {
                 output[(offsets[batch_class_idx] + box_idx) * index_size + k] =
-                indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k] ;
+                    indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k];
             }
         }
     });
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 976d84ade0e..a7baf80a755 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -458,8 +458,8 @@ struct miopen_apply
     {
         apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
             auto inputs = ins->inputs();
-            const auto& boxes_s  = inputs[0]->get_shape();
-            const auto& scores_s = inputs[1]->get_shape();
+            const auto& boxes_s    = inputs[0]->get_shape();
+            const auto& scores_s   = inputs[1]->get_shape();
             const auto num_batches = boxes_s.lens()[0];
             const auto num_boxes   = boxes_s.lens()[1];
             const auto num_classes = scores_s.lens()[1];
@@ -492,26 +492,25 @@ struct miopen_apply
                 inputs[1]);
             sorted = insert_precompile_op(sorted);
 
-            auto filter = mod->insert_instruction(
-                ins,
-                make_op("gpu::nms_filter",
-                        {{"num_batches", num_batches},
-                         {"num_classes", num_classes},
-                         {"num_boxes", num_boxes}}),
-                sorted,
-                inputs[2],
-                inputs[3],
-                inputs[4],
-                mask_alloc);
-            filter = insert_precompile_op(filter);
+            auto filter = mod->insert_instruction(ins,
+                                                  make_op("gpu::nms_filter",
+                                                          {{"num_batches", num_batches},
+                                                           {"num_classes", num_classes},
+                                                           {"num_boxes", num_boxes}}),
+                                                  sorted,
+                                                  inputs[2],
+                                                  inputs[3],
+                                                  inputs[4],
+                                                  mask_alloc);
+            filter      = insert_precompile_op(filter);
 
             auto raw_output =
                 mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
             auto bc_counts =
                 mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
 
-            auto compact = mod->insert_instruction(
-                ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
+            auto compact =
+                mod->insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
             compact = insert_precompile_op(compact);
 
             return mod->replace_instruction(ins, compact);
diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp
index 119b2ab6a8e..f123263b596 100644
--- a/test/gpu/nonmaxsuppression.cpp
+++ b/test/gpu/nonmaxsuppression.cpp
@@ -90,12 +90,12 @@ TEST_CASE(nms_default_test)
                             score_threshold);
     add_nms_return(mm, nms);
 
-    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+    std::vector<float> boxes_vec  = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                     0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
     std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
-    int64_t max_out_val = 4;
-    float iou_val       = 0.5f;
-    float score_val     = 0.0f;
+    int64_t max_out_val           = 4;
+    float iou_val                 = 0.5f;
+    float score_val               = 0.0f;
 
     migraphx::parameter_map host_params;
     host_params["boxes"]           = migraphx::argument(boxes_s, boxes_vec.data());
@@ -133,10 +133,10 @@ TEST_CASE(nms_identical_all_test)
                                    score_threshold);
     add_nms_return(mm, nms);
 
-    std::vector<float> boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5,
-                                    0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5};
+    std::vector<float> boxes_vec  = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5,
+                                     0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5};
     std::vector<float> scores_vec = {0.9f, 0.9f, 0.9f, 0.9f, 0.9f, 0.9f};
-    float iou_val = 0.1f;
+    float iou_val                 = 0.1f;
 
     migraphx::parameter_map host_params;
     host_params["boxes"]         = migraphx::argument(boxes_s, boxes_vec.data());
@@ -171,9 +171,9 @@ TEST_CASE(nms_not_center_test)
                                    score_threshold);
     add_nms_return(mm, nms);
 
-    std::vector<float> boxes_vec = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
-                                    0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
-                                    1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
+    std::vector<float> boxes_vec  = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
+                                     0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
+                                     1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
     std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
 
     migraphx::parameter_map host_params;
@@ -217,7 +217,7 @@ TEST_CASE(nms_transpose1_test)
         1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
     };
     std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
-    int64_t max_out_val = 4;
+    int64_t max_out_val           = 4;
 
     migraphx::parameter_map host_params;
     host_params["boxes"]   = migraphx::argument(boxes_s, boxes_vec.data());
@@ -296,10 +296,10 @@ TEST_CASE(nms_multi_batch_test)
                             score_threshold);
     add_nms_return(mm, nms);
 
-    std::vector<float> boxes_vec = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0,
-                                    0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+    std::vector<float> boxes_vec  = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                     0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0,
+                                     0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
+                                     0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
     std::vector<float> scores_vec = {
         0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
     int64_t max_out_val = 4;
@@ -343,9 +343,9 @@ TEST_CASE(nms_multi_class_test)
                             score_threshold);
     add_nms_return(mm, nms);
 
-    std::vector<float> boxes_vec = {0.0, 0.0,  1.0, 1.0,  0.0, 0.1,   1.0, 1.1,
-                                    0.0, -0.1, 1.0, 0.9,  0.0, 10.0,  1.0, 11.0,
-                                    0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
+    std::vector<float> boxes_vec  = {0.0, 0.0,  1.0, 1.0,  0.0, 0.1,   1.0, 1.1,
+                                     0.0, -0.1, 1.0, 0.9,  0.0, 10.0,  1.0, 11.0,
+                                     0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
     std::vector<float> scores_vec = {
         0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
     float score_val = 0.0f;
@@ -376,26 +376,27 @@ TEST_CASE(nms_20boxes_test)
     auto iou_threshold   = mm->add_literal(0.5000f);
     auto score_threshold = mm->add_literal(0.0000f);
 
-    auto nms =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
-                            boxes_p,
-                            scores_p,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_p,
+                                   scores_p,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
     add_nms_return(mm, nms);
 
     std::vector<float> boxes_vec = {
-        32.7256f, 35.1377f, 43.0832f, 42.2579f, 13.9286f, 15.6152f, 21.5240f, 28.2727f, 44.0782f, 37.5280f, 52.9916f, 48.3318f,
-        38.8011f, 32.1818f, 50.5110f, 37.5550f, 33.9761f, -1.6170f, 43.8622f, 11.0347f, 5.3569f, 42.6478f, 14.1070f, 54.9145f,
-        18.9216f, 34.8446f, 27.7505f, 41.2693f, -0.4375f, 36.7849f, 4.8178f, 41.8215f, 6.9987f, 1.1282f, 8.4302f, 11.6832f,
-        30.5954f, 21.0410f, 37.7095f, 23.9976f, 35.2360f, 16.6405f, 39.2402f, 20.4393f, 45.0158f, 45.7867f, 51.7352f, 46.8898f,
-        9.8174f, 26.1848f, 22.7651f, 38.2017f, 16.3854f, 35.9841f, 20.6606f, 46.2920f, 22.5697f, 16.7346f, 24.3859f, 27.6069f,
-        7.0039f, 5.3968f, 11.9433f, 17.3270f, 3.9409f, 24.0168f, 9.0512f, 31.4417f, 18.6518f, -1.2903f, 28.9187f, 7.6721f,
-        6.9462f, 39.9030f, 15.7447f, 42.8601f, 27.5034f, 30.2815f, 39.4780f, 32.8849f};
-    std::vector<float> scores_vec = {
-        0.6979f, 0.4657f, 0.8326f, 0.2503f, 0.1204f, 0.1810f, 0.7501f, 0.5157f, 0.2451f, 0.5509f, 0.2371f, 0.7267f,
-        0.5015f, 0.4429f, 0.3714f, 0.6673f, 0.4256f, 0.1789f, 0.2062f, 0.9657f};
+        32.7256f, 35.1377f, 43.0832f, 42.2579f, 13.9286f, 15.6152f, 21.5240f, 28.2727f, 44.0782f,
+        37.5280f, 52.9916f, 48.3318f, 38.8011f, 32.1818f, 50.5110f, 37.5550f, 33.9761f, -1.6170f,
+        43.8622f, 11.0347f, 5.3569f,  42.6478f, 14.1070f, 54.9145f, 18.9216f, 34.8446f, 27.7505f,
+        41.2693f, -0.4375f, 36.7849f, 4.8178f,  41.8215f, 6.9987f,  1.1282f,  8.4302f,  11.6832f,
+        30.5954f, 21.0410f, 37.7095f, 23.9976f, 35.2360f, 16.6405f, 39.2402f, 20.4393f, 45.0158f,
+        45.7867f, 51.7352f, 46.8898f, 9.8174f,  26.1848f, 22.7651f, 38.2017f, 16.3854f, 35.9841f,
+        20.6606f, 46.2920f, 22.5697f, 16.7346f, 24.3859f, 27.6069f, 7.0039f,  5.3968f,  11.9433f,
+        17.3270f, 3.9409f,  24.0168f, 9.0512f,  31.4417f, 18.6518f, -1.2903f, 28.9187f, 7.6721f,
+        6.9462f,  39.9030f, 15.7447f, 42.8601f, 27.5034f, 30.2815f, 39.4780f, 32.8849f};
+    std::vector<float> scores_vec = {0.6979f, 0.4657f, 0.8326f, 0.2503f, 0.1204f, 0.1810f, 0.7501f,
+                                     0.5157f, 0.2451f, 0.5509f, 0.2371f, 0.7267f, 0.5015f, 0.4429f,
+                                     0.3714f, 0.6673f, 0.4256f, 0.1789f, 0.2062f, 0.9657f};
 
     migraphx::parameter_map host_params;
     host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
@@ -403,7 +404,8 @@ TEST_CASE(nms_20boxes_test)
 
     auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
-    std::vector<int64_t> gold = {0, 0, 19, 0, 0, 2, 0, 0, 6, 0, 0, 11, 0, 0, 0, 0, 0, 15, 0, 0, 9, 0, 0, 7, 0, 0, 12, 0, 0, 1};
+    std::vector<int64_t> gold = {0, 0, 19, 0, 0, 2, 0, 0, 6, 0, 0, 11, 0, 0, 0,
+                                 0, 0, 15, 0, 0, 9, 0, 0, 7, 0, 0, 12, 0, 0, 1};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
     EXPECT(num_selected == 10);
 }
@@ -432,29 +434,35 @@ TEST_CASE(nms_50boxes_center_test)
     add_nms_return(mm, nms);
 
     std::vector<float> boxes_vec = {
-        90.8581f, 82.6292f, 23.5447f, 19.9060f, 69.9707f, 89.6161f, 29.1830f, 26.1572f, 26.5870f, 14.0249f, 15.5215f, 14.1630f,
-        96.9176f, 55.4036f, 5.1730f, 8.1873f, 77.8751f, 10.8576f, 1.4042f, 7.8632f, 71.6890f, 67.2240f, 7.6600f, 22.6344f,
-        44.9361f, 28.1234f, 4.8228f, 24.6805f, 27.2242f, 65.9423f, 20.6521f, 4.0531f, 9.6391f, 72.6995f, 4.5331f, 2.9302f,
-        90.2602f, 76.8647f, 15.6836f, 18.2386f, 45.5776f, 10.7741f, 21.1336f, 5.2390f, 20.2363f, 91.6012f, 17.8524f, 24.9153f,
-        30.5957f, 23.0214f, 6.7935f, 9.9997f, 57.9220f, 3.7413f, 24.3196f, 5.1723f, 17.6773f, 55.4852f, 21.7468f, 27.7081f,
-        85.6614f, 37.0922f, 22.4305f, 5.8004f, 75.8520f, 82.9790f, 4.8007f, 9.2569f, 71.9463f, 80.8251f, 4.5889f, 5.4548f,
-        43.2093f, 31.7139f, 27.8993f, 4.3492f, 62.7309f, 95.2899f, 12.5298f, 1.6133f, 58.4098f, 29.0918f, 9.7275f, 2.6065f,
-        64.9847f, 51.5057f, 15.1689f, 6.0646f, 8.4444f, 25.5965f, 20.2231f, 2.5481f, 41.5807f, 93.6044f, 28.7131f, 18.1432f,
-        4.1614f, 16.4608f, 9.3069f, 20.7407f, 49.3991f, 4.4911f, 27.8194f, 12.4153f, 32.9861f, 43.5097f, 1.7209f, 10.2217f,
-        14.4524f, 99.2376f, 17.1007f, 15.6313f, 10.3403f, 89.1677f, 19.3853f, 26.3751f, 58.7645f, 74.8608f, 4.0710f, 25.6828f,
-        17.0593f, 89.0792f, 5.0698f, 2.2608f, 92.5120f, 89.3447f, 13.1543f, 6.2635f, 58.1061f, 51.8858f, 29.0207f, 7.8656f,
-        34.6870f, 31.5929f, 18.2852f, 8.2322f, 59.0915f, 77.2012f, 28.0577f, 17.5657f, 2.2804f, 66.1661f, 24.3265f, 13.0716f,
-        95.8559f, 37.3658f, 14.5541f, 2.4284f, 48.2303f, 9.4467f, 23.7581f, 11.8348f, 78.2735f, 74.6790f, 1.5173f, 16.1888f,
-        8.2730f, 26.2461f, 4.1652f, 3.9485f, 48.6658f, 93.6813f, 25.0534f, 25.1703f, 49.0707f, 24.0971f, 24.1077f, 2.5069f,
-        93.7826f, 12.2758f, 7.7466f, 27.8204f, 57.1728f, 83.1113f, 16.3923f, 3.8743f, 47.3489f, 15.3284f, 18.5745f, 25.4637f,
-        26.6976f, 17.9268f, 26.1644f, 27.1769f, 33.1569f, 59.9383f, 18.4901f, 29.4075f, 52.0672f, 87.4562f, 12.9646f, 24.2588f,
-        43.8911f, 19.6435f, 11.8513f, 23.6048f, 2.1612f, 31.0324f, 13.3506f, 19.6320f};
+        90.8581f, 82.6292f, 23.5447f, 19.9060f, 69.9707f, 89.6161f, 29.1830f, 26.1572f, 26.5870f,
+        14.0249f, 15.5215f, 14.1630f, 96.9176f, 55.4036f, 5.1730f,  8.1873f,  77.8751f, 10.8576f,
+        1.4042f,  7.8632f,  71.6890f, 67.2240f, 7.6600f,  22.6344f, 44.9361f, 28.1234f, 4.8228f,
+        24.6805f, 27.2242f, 65.9423f, 20.6521f, 4.0531f,  9.6391f,  72.6995f, 4.5331f,  2.9302f,
+        90.2602f, 76.8647f, 15.6836f, 18.2386f, 45.5776f, 10.7741f, 21.1336f, 5.2390f,  20.2363f,
+        91.6012f, 17.8524f, 24.9153f, 30.5957f, 23.0214f, 6.7935f,  9.9997f,  57.9220f, 3.7413f,
+        24.3196f, 5.1723f,  17.6773f, 55.4852f, 21.7468f, 27.7081f, 85.6614f, 37.0922f, 22.4305f,
+        5.8004f,  75.8520f, 82.9790f, 4.8007f,  9.2569f,  71.9463f, 80.8251f, 4.5889f,  5.4548f,
+        43.2093f, 31.7139f, 27.8993f, 4.3492f,  62.7309f, 95.2899f, 12.5298f, 1.6133f,  58.4098f,
+        29.0918f, 9.7275f,  2.6065f,  64.9847f, 51.5057f, 15.1689f, 6.0646f,  8.4444f,  25.5965f,
+        20.2231f, 2.5481f,  41.5807f, 93.6044f, 28.7131f, 18.1432f, 4.1614f,  16.4608f, 9.3069f,
+        20.7407f, 49.3991f, 4.4911f,  27.8194f, 12.4153f, 32.9861f, 43.5097f, 1.7209f,  10.2217f,
+        14.4524f, 99.2376f, 17.1007f, 15.6313f, 10.3403f, 89.1677f, 19.3853f, 26.3751f, 58.7645f,
+        74.8608f, 4.0710f,  25.6828f, 17.0593f, 89.0792f, 5.0698f,  2.2608f,  92.5120f, 89.3447f,
+        13.1543f, 6.2635f,  58.1061f, 51.8858f, 29.0207f, 7.8656f,  34.6870f, 31.5929f, 18.2852f,
+        8.2322f,  59.0915f, 77.2012f, 28.0577f, 17.5657f, 2.2804f,  66.1661f, 24.3265f, 13.0716f,
+        95.8559f, 37.3658f, 14.5541f, 2.4284f,  48.2303f, 9.4467f,  23.7581f, 11.8348f, 78.2735f,
+        74.6790f, 1.5173f,  16.1888f, 8.2730f,  26.2461f, 4.1652f,  3.9485f,  48.6658f, 93.6813f,
+        25.0534f, 25.1703f, 49.0707f, 24.0971f, 24.1077f, 2.5069f,  93.7826f, 12.2758f, 7.7466f,
+        27.8204f, 57.1728f, 83.1113f, 16.3923f, 3.8743f,  47.3489f, 15.3284f, 18.5745f, 25.4637f,
+        26.6976f, 17.9268f, 26.1644f, 27.1769f, 33.1569f, 59.9383f, 18.4901f, 29.4075f, 52.0672f,
+        87.4562f, 12.9646f, 24.2588f, 43.8911f, 19.6435f, 11.8513f, 23.6048f, 2.1612f,  31.0324f,
+        13.3506f, 19.6320f};
     std::vector<float> scores_vec = {
-        0.8011f, 0.2211f, 0.5825f, 0.5628f, 0.8718f, 0.5165f, 0.4466f, 0.6756f, 0.3398f, 0.2258f, 0.5301f, 0.4752f,
-        0.3093f, 0.4308f, 0.4298f, 0.3947f, 0.4415f, 0.7172f, 0.3672f, 0.9540f, 0.9247f, 0.5328f, 0.3955f, 0.5819f,
-        0.8637f, 0.6873f, 0.8240f, 0.5795f, 0.6696f, 0.3593f, 0.7614f, 0.2822f, 0.7253f, 0.8746f, 0.2189f, 0.6529f,
-        0.1856f, 0.7531f, 0.1760f, 0.9423f, 0.2237f, 0.9630f, 0.8208f, 0.6343f, 0.8044f, 0.8156f, 0.9514f, 0.3280f,
-        0.6311f, 0.1855f};
+        0.8011f, 0.2211f, 0.5825f, 0.5628f, 0.8718f, 0.5165f, 0.4466f, 0.6756f, 0.3398f, 0.2258f,
+        0.5301f, 0.4752f, 0.3093f, 0.4308f, 0.4298f, 0.3947f, 0.4415f, 0.7172f, 0.3672f, 0.9540f,
+        0.9247f, 0.5328f, 0.3955f, 0.5819f, 0.8637f, 0.6873f, 0.8240f, 0.5795f, 0.6696f, 0.3593f,
+        0.7614f, 0.2822f, 0.7253f, 0.8746f, 0.2189f, 0.6529f, 0.1856f, 0.7531f, 0.1760f, 0.9423f,
+        0.2237f, 0.9630f, 0.8208f, 0.6343f, 0.8044f, 0.8156f, 0.9514f, 0.3280f, 0.6311f, 0.1855f};
 
     migraphx::parameter_map host_params;
     host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
@@ -462,7 +470,10 @@ TEST_CASE(nms_50boxes_center_test)
 
     auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
-    std::vector<int64_t> gold = {0, 0, 41, 0, 0, 19, 0, 0, 46, 0, 0, 39, 0, 0, 20, 0, 0, 33, 0, 0, 4, 0, 0, 24, 0, 0, 26, 0, 0, 42, 0, 0, 45, 0, 0, 44, 0, 0, 0, 0, 0, 30, 0, 0, 32, 0, 0, 17, 0, 0, 25, 0, 0, 7, 0, 0, 28, 0, 0, 35};
+    std::vector<int64_t> gold = {0, 0, 41, 0, 0, 19, 0, 0, 46, 0, 0, 39, 0, 0, 20,
+                                 0, 0, 33, 0, 0, 4,  0, 0, 24, 0, 0, 26, 0, 0, 42,
+                                 0, 0, 45, 0, 0, 44, 0, 0, 0,  0, 0, 30, 0, 0, 32,
+                                 0, 0, 17, 0, 0, 25, 0, 0, 7,  0, 0, 28, 0, 0, 35};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
     EXPECT(num_selected == 20);
 }
@@ -481,101 +492,136 @@ TEST_CASE(nms_100boxes_2batch_test)
     auto iou_threshold   = mm->add_literal(0.5000f);
     auto score_threshold = mm->add_literal(0.1000f);
 
-    auto nms =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
-                            boxes_p,
-                            scores_p,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_p,
+                                   scores_p,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
     add_nms_return(mm, nms);
 
     std::vector<float> boxes_vec = {
-        -3.8699f, 108.8880f, 20.8101f, 137.5783f, 149.9079f, 29.3134f, 203.7504f, 39.2031f, 121.6031f, 107.1528f, 162.2282f, 118.8275f,
-        27.1146f, 87.2265f, 42.1365f, 141.7457f, -7.3128f, 91.3799f, 44.0012f, 95.0142f, 25.9397f, 97.1572f, 47.4736f, 111.8955f,
-        170.3318f, 143.6689f, 221.6791f, 161.9004f, 82.3933f, 144.8881f, 101.0310f, 174.8098f, 138.9017f, 80.6305f, 174.7306f, 116.2308f,
-        115.0719f, 104.8666f, 139.4914f, 134.9707f, 105.8753f, 183.2658f, 123.0900f, 189.2287f, 2.3726f, 16.2585f, 55.6795f, 31.6349f,
-        183.1709f, -1.9651f, 195.2389f, 48.8066f, 57.2666f, -1.7671f, 63.2705f, 36.8507f, 105.0166f, 111.9228f, 126.1903f, 151.2225f,
-        118.2848f, 63.4507f, 161.6255f, 103.9927f, 105.5274f, 131.8586f, 154.1659f, 177.8699f, 158.1560f, 132.0321f, 218.0818f, 136.4605f,
-        20.4451f, 55.4126f, 38.9305f, 78.0425f, 89.1363f, 163.2572f, 114.2048f, 196.0894f, 76.2707f, 142.0220f, 85.3431f, 162.9909f,
-        77.3750f, 28.6949f, 112.2925f, 79.5191f, -6.0851f, 58.1025f, 53.7721f, 87.5743f, 5.6429f, 39.7135f, 47.9949f, 86.0625f,
-        37.5563f, 5.8879f, 73.6739f, 57.1568f, 48.8660f, 14.1653f, 73.0158f, 44.9480f, 58.0793f, 159.8937f, 113.0820f, 214.5573f,
-        107.0385f, 69.7607f, 137.3566f, 105.4010f, 122.4620f, 51.0809f, 131.3896f, 102.2471f, 71.0835f, 135.3897f, 93.6408f, 156.4846f,
-        79.2752f, 95.3835f, 84.2380f, 125.8137f, 37.0673f, 171.0514f, 49.9841f, 203.4046f, 116.6400f, 152.4634f, 118.6825f, 159.6572f,
-        49.5364f, 83.6166f, 77.2799f, 108.1312f, -12.0070f, 47.7104f, 26.4309f, 102.8334f, 73.0529f, 178.2168f, 94.3071f, 216.4359f,
-        81.9253f, 137.8156f, 107.7278f, 149.2885f, 16.3219f, 179.7427f, 73.9152f, 200.7352f, 91.8087f, 17.5434f, 137.1745f, 29.8480f,
-        96.6991f, 168.8745f, 129.6096f, 171.3390f, 131.5065f, 99.5547f, 149.2944f, 155.2749f, 102.6283f, 10.6622f, 156.5511f, 38.1065f,
-        123.0512f, 108.0793f, 137.9220f, 127.2239f, 53.1452f, 119.0642f, 73.3404f, 155.3743f, 130.1690f, 1.7448f, 184.8039f, 3.1763f,
-        93.7074f, 82.1619f, 125.9504f, 99.5652f, 63.8853f, 143.8404f, 108.6820f, 186.3194f, 107.2755f, 39.8756f, 143.1295f, 78.2680f,
-        52.3550f, 62.2463f, 91.9079f, 121.1729f, 93.2160f, 69.6623f, 111.8797f, 107.2634f, 139.7207f, 45.7991f, 154.9616f, 74.9719f,
-        167.2671f, 160.7261f, 187.2941f, 206.6506f, 179.1259f, 129.1106f, 189.2970f, 183.4070f, 74.4343f, 0.3572f, 127.0189f, 43.8782f,
-        95.1992f, 170.4922f, 112.9108f, 228.3217f, 142.9101f, 152.2709f, 177.0380f, 199.4092f, 39.0269f, 30.7110f, 86.7534f, 82.8523f,
-        143.8537f, 163.5132f, 191.0993f, 171.2454f, 85.3959f, -0.8223f, 112.2607f, 43.3901f, 8.6218f, 186.3383f, 37.7209f, 213.3036f,
-        -15.4319f, 116.3204f, 44.2555f, 149.9535f, 147.9980f, 110.2290f, 188.7993f, 149.8210f, -13.4183f, -11.0214f, 35.6454f, 47.1977f,
-        28.9969f, 149.8616f, 83.2476f, 208.9517f, 43.0921f, -3.2028f, 90.5599f, 14.8026f, 28.6361f, 26.0199f, 40.5617f, 70.3113f,
-        45.6946f, 5.9799f, 79.8627f, 51.2289f, 145.0326f, 144.6320f, 152.0444f, 166.0751f, -16.8246f, 35.4867f, 22.6978f, 43.7950f,
-        136.7519f, 180.4197f, 194.1175f, 183.8356f, 155.6840f, 107.8222f, 186.9352f, 154.6854f, 61.1796f, -7.7136f, 87.7250f, 22.1787f,
-        29.1652f, -28.4875f, 32.2799f, 30.6594f, 91.3547f, -3.8851f, 148.9814f, 24.5483f, 20.3959f, 91.8365f, 27.4731f, 150.5336f,
-        71.2720f, 147.6549f, 74.6957f, 172.9379f, 183.9269f, 23.7969f, 199.4448f, 71.6242f, 196.6597f, 166.8796f, 201.5260f, 172.8839f,
-        140.4950f, -5.4397f, 168.3470f, 28.3325f, 46.4677f, 136.0320f, 77.9169f, 184.3535f, 127.8122f, 157.7804f, 147.2538f, 213.3378f,
-        139.0779f, 129.6555f, 143.0846f, 179.1879f, 73.7761f, 138.0335f, 81.3605f, 141.2148f, 116.3348f, 156.1013f, 140.0206f, 179.0908f,
-        -0.1401f, 6.0937f, 4.4311f, 9.9669f, 20.7149f, 36.6326f, 62.9081f, 44.0802f, 98.4106f, 4.5632f, 111.6248f, 45.4062f,
-        23.3391f, 79.3651f, 42.1614f, 122.4473f, 21.0547f, 125.7129f, 45.3081f, 172.3624f, 154.4709f, 99.9714f, 180.0508f, 152.0333f,
-        197.2776f, 147.9130f, 198.3756f, 192.5394f, 107.3878f, 6.9169f, 115.0000f, 55.1683f, 141.8624f, 144.9798f, 193.7655f, 148.8687f,
-        197.5280f, 31.1895f, 198.6007f, 46.0271f, 12.8282f, 35.3058f, 43.8101f, 72.9977f, 74.7088f, 116.1662f, 104.5894f, 167.7956f,
-        68.1883f, 195.4082f, 88.8408f, 196.6737f, 2.7857f, 106.6272f, 29.2340f, 137.9903f, 127.5389f, -9.5799f, 174.5932f, 31.3800f,
-        61.4403f, 121.8884f, 112.0713f, 124.6352f, 15.4868f, 35.9096f, 55.8899f, 68.2298f, 35.5922f, 56.6701f, 44.2246f, 72.3261f,
-        163.1796f, 40.7751f, 180.4136f, 56.2181f, 177.9262f, 90.7157f, 187.1069f, 101.2297f, 33.5656f, 108.4211f, 51.2933f, 164.8822f,
-        73.5555f, 18.9549f, 114.3649f, 72.3462f, 119.3443f, 42.7151f, 174.0536f, 89.5792f, 169.1987f, 170.3059f, 182.1476f, 201.8479f,
-        59.3192f, -5.2591f, 92.3019f, 24.6868f, 82.2129f, 76.0264f, 124.5949f, 108.2814f, 119.7321f, 125.9828f, 176.9545f, 158.6404f,
-        127.7304f, 16.7712f, 164.7240f, 43.4104f, 148.5664f, 5.0880f, 164.6177f, 13.8616f, 95.0352f, 23.4340f, 132.9384f, 31.8482f,
-        10.9685f, 155.1733f, 30.8775f, 212.3560f, 151.4989f, -12.8680f, 210.0904f, 16.5719f, 160.8241f, 9.0448f, 185.4050f, 66.2840f,
-        138.8994f, 0.9312f, 180.3396f, 11.5822f, 18.7873f, 5.2706f, 21.1577f, 38.9812f, 28.5777f, 117.4022f, 53.1813f, 130.6575f,
-        122.4044f, 40.3588f, 175.0358f, 56.2967f, -13.8737f, 112.4558f, 23.1297f, 115.2290f, 182.2486f, 114.0300f, 209.4412f, 122.0482f,
-        47.3188f, 142.3400f, 103.5391f, 197.4341f, 118.1700f, -9.0369f, 169.5550f, 10.9335f, 167.5089f, 152.2341f, 187.5196f, 189.1137f,
-        62.3618f, 109.6059f, 95.4902f, 138.0417f, 48.8767f, 20.2354f, 78.7763f, 44.8620f, 102.5983f, 138.3968f, 140.8982f, 170.7781f,
-        105.8416f, 165.0748f, 126.5542f, 177.1219f, 74.1239f, 21.1889f, 89.5320f, 80.5165f, 92.9311f, 159.1187f, 147.7788f, 208.3988f,
-        159.3220f, 68.5139f, 214.8306f, 113.2691f, 68.1500f, 106.3565f, 118.9061f, 135.0133f, 9.9914f, 191.9200f, 68.7055f, 201.9398f,
-        52.9639f, 44.6476f, 97.9184f, 99.9669f, 55.7637f, 152.0609f, 101.8791f, 173.2028f, 3.2253f, 61.7017f, 49.2181f, 65.6580f,
-        17.8964f, 149.2418f, 47.2522f, 170.4436f, 122.9471f, 96.2103f, 150.8778f, 144.0833f, 60.3089f, 24.4012f, 75.4822f, 62.1410f,
-        171.4575f, 60.1555f, 210.5018f, 105.4550f, 39.6844f, 39.6149f, 57.7543f, 87.4394f, 11.6796f, 8.8690f, 27.8902f, 22.3743f,
-        132.9151f, -21.7847f, 168.4868f, 33.7186f, 163.6127f, 55.8750f, 188.8017f, 82.7164f, 48.6664f, -15.5441f, 62.5789f, 23.1577f,
-        15.8440f, 32.5294f, 64.9913f, 33.6657f, 11.2664f, 115.2323f, 63.0400f, 174.8410f, 98.9553f, 132.8318f, 109.8496f, 150.4047f,
-        92.9619f, 145.3852f, 94.4048f, 150.0469f, 41.4721f, 49.4119f, 62.3038f, 77.4494f, -14.9919f, 173.6975f, 33.0612f, 182.3103f,
-        71.0426f, 113.7725f, 121.5539f, 123.7598f, 187.2858f, 6.0529f, 196.4472f, 44.3576f, 107.1609f, 16.6524f, 153.8468f, 40.8351f,
-        95.1880f, 110.9244f, 103.0146f, 166.3137f, 10.1316f, 24.6737f, 34.1453f, 44.5039f, 20.5283f, 79.5362f, 80.4462f, 123.3809f,
-        52.7734f, 184.2525f, 65.1362f, 212.4573f, 147.9188f, -19.1670f, 158.0026f, 20.7701f, 162.3696f, -14.8751f, 188.3148f, 21.5070f,
-        161.5482f, 184.1698f, 199.1086f, 213.0640f, 168.8931f, 88.4010f, 224.9343f, 145.4546f, 167.0391f, 14.7719f, 225.9076f, 35.9920f,
-        188.0454f, 173.7320f, 193.1542f, 185.1889f, 9.7935f, 155.5723f, 18.9354f, 196.5798f, 3.7319f, 81.7829f, 51.3855f, 132.6973f,
-        52.4097f, 122.6709f, 69.3770f, 126.0459f, 83.9766f, 40.8733f, 137.1827f, 68.4016f, -0.6763f, -16.7244f, 39.4674f, 36.9323f,
-        165.3600f, 96.2998f, 172.9588f, 141.5273f, 98.2916f, 29.1927f, 148.4108f, 88.7094f, 102.7704f, 116.5475f, 114.1754f, 148.9009f,
-        20.0692f, 147.2792f, 46.0554f, 187.2189f, 33.8616f, -5.7911f, 67.4406f, 13.0553f, 16.7898f, 90.6905f, 47.3350f, 147.5951f,
-        149.6448f, 34.9492f, 191.1284f, 57.5630f, 97.0913f, 152.4916f, 136.5998f, 197.0638f, 117.2606f, 38.3403f, 176.7911f, 63.1255f,
-        29.2236f, 105.0804f, 89.1895f, 139.2277f, 58.5150f, 88.9746f, 89.9861f, 132.4418f, 77.6626f, 63.7197f, 84.2794f, 94.7469f,
-        130.0316f, 108.2651f, 173.9744f, 162.7832f, 125.1590f, 132.2845f, 183.7822f, 158.0233f, 31.4721f, 93.7989f, 51.2533f, 132.9762f,
-        174.2021f, 141.0848f, 202.4134f, 162.2841f, 11.1001f, 184.1428f, 37.1620f, 209.2240f, 177.2076f, 70.3730f, 181.2413f, 97.3360f,
-        -0.2527f, 98.7053f, 40.4109f, 107.1279f, 41.9845f, -0.7119f, 63.8314f, 5.6998f, 145.5655f, 139.0148f, 193.0259f, 179.3967f,
-        10.8509f, 84.2082f, 60.9460f, 123.8838f, 57.9873f, 61.5364f, 107.4399f, 101.6481f, 77.1802f, 17.7313f, 102.7635f, 19.8975f,
-        39.0662f, 167.7982f, 59.0374f, 188.0644f, 119.4588f, 72.6661f, 164.6393f, 85.3368f, 146.1259f, 113.0609f, 194.4079f, 159.9718f,
-        159.9229f, 3.9862f, 189.9071f, 55.7634f, 41.0200f, 184.5329f, 94.7088f, 200.0870f};
+        -3.8699f,  108.8880f, 20.8101f,  137.5783f, 149.9079f, 29.3134f,  203.7504f, 39.2031f,
+        121.6031f, 107.1528f, 162.2282f, 118.8275f, 27.1146f,  87.2265f,  42.1365f,  141.7457f,
+        -7.3128f,  91.3799f,  44.0012f,  95.0142f,  25.9397f,  97.1572f,  47.4736f,  111.8955f,
+        170.3318f, 143.6689f, 221.6791f, 161.9004f, 82.3933f,  144.8881f, 101.0310f, 174.8098f,
+        138.9017f, 80.6305f,  174.7306f, 116.2308f, 115.0719f, 104.8666f, 139.4914f, 134.9707f,
+        105.8753f, 183.2658f, 123.0900f, 189.2287f, 2.3726f,   16.2585f,  55.6795f,  31.6349f,
+        183.1709f, -1.9651f,  195.2389f, 48.8066f,  57.2666f,  -1.7671f,  63.2705f,  36.8507f,
+        105.0166f, 111.9228f, 126.1903f, 151.2225f, 118.2848f, 63.4507f,  161.6255f, 103.9927f,
+        105.5274f, 131.8586f, 154.1659f, 177.8699f, 158.1560f, 132.0321f, 218.0818f, 136.4605f,
+        20.4451f,  55.4126f,  38.9305f,  78.0425f,  89.1363f,  163.2572f, 114.2048f, 196.0894f,
+        76.2707f,  142.0220f, 85.3431f,  162.9909f, 77.3750f,  28.6949f,  112.2925f, 79.5191f,
+        -6.0851f,  58.1025f,  53.7721f,  87.5743f,  5.6429f,   39.7135f,  47.9949f,  86.0625f,
+        37.5563f,  5.8879f,   73.6739f,  57.1568f,  48.8660f,  14.1653f,  73.0158f,  44.9480f,
+        58.0793f,  159.8937f, 113.0820f, 214.5573f, 107.0385f, 69.7607f,  137.3566f, 105.4010f,
+        122.4620f, 51.0809f,  131.3896f, 102.2471f, 71.0835f,  135.3897f, 93.6408f,  156.4846f,
+        79.2752f,  95.3835f,  84.2380f,  125.8137f, 37.0673f,  171.0514f, 49.9841f,  203.4046f,
+        116.6400f, 152.4634f, 118.6825f, 159.6572f, 49.5364f,  83.6166f,  77.2799f,  108.1312f,
+        -12.0070f, 47.7104f,  26.4309f,  102.8334f, 73.0529f,  178.2168f, 94.3071f,  216.4359f,
+        81.9253f,  137.8156f, 107.7278f, 149.2885f, 16.3219f,  179.7427f, 73.9152f,  200.7352f,
+        91.8087f,  17.5434f,  137.1745f, 29.8480f,  96.6991f,  168.8745f, 129.6096f, 171.3390f,
+        131.5065f, 99.5547f,  149.2944f, 155.2749f, 102.6283f, 10.6622f,  156.5511f, 38.1065f,
+        123.0512f, 108.0793f, 137.9220f, 127.2239f, 53.1452f,  119.0642f, 73.3404f,  155.3743f,
+        130.1690f, 1.7448f,   184.8039f, 3.1763f,   93.7074f,  82.1619f,  125.9504f, 99.5652f,
+        63.8853f,  143.8404f, 108.6820f, 186.3194f, 107.2755f, 39.8756f,  143.1295f, 78.2680f,
+        52.3550f,  62.2463f,  91.9079f,  121.1729f, 93.2160f,  69.6623f,  111.8797f, 107.2634f,
+        139.7207f, 45.7991f,  154.9616f, 74.9719f,  167.2671f, 160.7261f, 187.2941f, 206.6506f,
+        179.1259f, 129.1106f, 189.2970f, 183.4070f, 74.4343f,  0.3572f,   127.0189f, 43.8782f,
+        95.1992f,  170.4922f, 112.9108f, 228.3217f, 142.9101f, 152.2709f, 177.0380f, 199.4092f,
+        39.0269f,  30.7110f,  86.7534f,  82.8523f,  143.8537f, 163.5132f, 191.0993f, 171.2454f,
+        85.3959f,  -0.8223f,  112.2607f, 43.3901f,  8.6218f,   186.3383f, 37.7209f,  213.3036f,
+        -15.4319f, 116.3204f, 44.2555f,  149.9535f, 147.9980f, 110.2290f, 188.7993f, 149.8210f,
+        -13.4183f, -11.0214f, 35.6454f,  47.1977f,  28.9969f,  149.8616f, 83.2476f,  208.9517f,
+        43.0921f,  -3.2028f,  90.5599f,  14.8026f,  28.6361f,  26.0199f,  40.5617f,  70.3113f,
+        45.6946f,  5.9799f,   79.8627f,  51.2289f,  145.0326f, 144.6320f, 152.0444f, 166.0751f,
+        -16.8246f, 35.4867f,  22.6978f,  43.7950f,  136.7519f, 180.4197f, 194.1175f, 183.8356f,
+        155.6840f, 107.8222f, 186.9352f, 154.6854f, 61.1796f,  -7.7136f,  87.7250f,  22.1787f,
+        29.1652f,  -28.4875f, 32.2799f,  30.6594f,  91.3547f,  -3.8851f,  148.9814f, 24.5483f,
+        20.3959f,  91.8365f,  27.4731f,  150.5336f, 71.2720f,  147.6549f, 74.6957f,  172.9379f,
+        183.9269f, 23.7969f,  199.4448f, 71.6242f,  196.6597f, 166.8796f, 201.5260f, 172.8839f,
+        140.4950f, -5.4397f,  168.3470f, 28.3325f,  46.4677f,  136.0320f, 77.9169f,  184.3535f,
+        127.8122f, 157.7804f, 147.2538f, 213.3378f, 139.0779f, 129.6555f, 143.0846f, 179.1879f,
+        73.7761f,  138.0335f, 81.3605f,  141.2148f, 116.3348f, 156.1013f, 140.0206f, 179.0908f,
+        -0.1401f,  6.0937f,   4.4311f,   9.9669f,   20.7149f,  36.6326f,  62.9081f,  44.0802f,
+        98.4106f,  4.5632f,   111.6248f, 45.4062f,  23.3391f,  79.3651f,  42.1614f,  122.4473f,
+        21.0547f,  125.7129f, 45.3081f,  172.3624f, 154.4709f, 99.9714f,  180.0508f, 152.0333f,
+        197.2776f, 147.9130f, 198.3756f, 192.5394f, 107.3878f, 6.9169f,   115.0000f, 55.1683f,
+        141.8624f, 144.9798f, 193.7655f, 148.8687f, 197.5280f, 31.1895f,  198.6007f, 46.0271f,
+        12.8282f,  35.3058f,  43.8101f,  72.9977f,  74.7088f,  116.1662f, 104.5894f, 167.7956f,
+        68.1883f,  195.4082f, 88.8408f,  196.6737f, 2.7857f,   106.6272f, 29.2340f,  137.9903f,
+        127.5389f, -9.5799f,  174.5932f, 31.3800f,  61.4403f,  121.8884f, 112.0713f, 124.6352f,
+        15.4868f,  35.9096f,  55.8899f,  68.2298f,  35.5922f,  56.6701f,  44.2246f,  72.3261f,
+        163.1796f, 40.7751f,  180.4136f, 56.2181f,  177.9262f, 90.7157f,  187.1069f, 101.2297f,
+        33.5656f,  108.4211f, 51.2933f,  164.8822f, 73.5555f,  18.9549f,  114.3649f, 72.3462f,
+        119.3443f, 42.7151f,  174.0536f, 89.5792f,  169.1987f, 170.3059f, 182.1476f, 201.8479f,
+        59.3192f,  -5.2591f,  92.3019f,  24.6868f,  82.2129f,  76.0264f,  124.5949f, 108.2814f,
+        119.7321f, 125.9828f, 176.9545f, 158.6404f, 127.7304f, 16.7712f,  164.7240f, 43.4104f,
+        148.5664f, 5.0880f,   164.6177f, 13.8616f,  95.0352f,  23.4340f,  132.9384f, 31.8482f,
+        10.9685f,  155.1733f, 30.8775f,  212.3560f, 151.4989f, -12.8680f, 210.0904f, 16.5719f,
+        160.8241f, 9.0448f,   185.4050f, 66.2840f,  138.8994f, 0.9312f,   180.3396f, 11.5822f,
+        18.7873f,  5.2706f,   21.1577f,  38.9812f,  28.5777f,  117.4022f, 53.1813f,  130.6575f,
+        122.4044f, 40.3588f,  175.0358f, 56.2967f,  -13.8737f, 112.4558f, 23.1297f,  115.2290f,
+        182.2486f, 114.0300f, 209.4412f, 122.0482f, 47.3188f,  142.3400f, 103.5391f, 197.4341f,
+        118.1700f, -9.0369f,  169.5550f, 10.9335f,  167.5089f, 152.2341f, 187.5196f, 189.1137f,
+        62.3618f,  109.6059f, 95.4902f,  138.0417f, 48.8767f,  20.2354f,  78.7763f,  44.8620f,
+        102.5983f, 138.3968f, 140.8982f, 170.7781f, 105.8416f, 165.0748f, 126.5542f, 177.1219f,
+        74.1239f,  21.1889f,  89.5320f,  80.5165f,  92.9311f,  159.1187f, 147.7788f, 208.3988f,
+        159.3220f, 68.5139f,  214.8306f, 113.2691f, 68.1500f,  106.3565f, 118.9061f, 135.0133f,
+        9.9914f,   191.9200f, 68.7055f,  201.9398f, 52.9639f,  44.6476f,  97.9184f,  99.9669f,
+        55.7637f,  152.0609f, 101.8791f, 173.2028f, 3.2253f,   61.7017f,  49.2181f,  65.6580f,
+        17.8964f,  149.2418f, 47.2522f,  170.4436f, 122.9471f, 96.2103f,  150.8778f, 144.0833f,
+        60.3089f,  24.4012f,  75.4822f,  62.1410f,  171.4575f, 60.1555f,  210.5018f, 105.4550f,
+        39.6844f,  39.6149f,  57.7543f,  87.4394f,  11.6796f,  8.8690f,   27.8902f,  22.3743f,
+        132.9151f, -21.7847f, 168.4868f, 33.7186f,  163.6127f, 55.8750f,  188.8017f, 82.7164f,
+        48.6664f,  -15.5441f, 62.5789f,  23.1577f,  15.8440f,  32.5294f,  64.9913f,  33.6657f,
+        11.2664f,  115.2323f, 63.0400f,  174.8410f, 98.9553f,  132.8318f, 109.8496f, 150.4047f,
+        92.9619f,  145.3852f, 94.4048f,  150.0469f, 41.4721f,  49.4119f,  62.3038f,  77.4494f,
+        -14.9919f, 173.6975f, 33.0612f,  182.3103f, 71.0426f,  113.7725f, 121.5539f, 123.7598f,
+        187.2858f, 6.0529f,   196.4472f, 44.3576f,  107.1609f, 16.6524f,  153.8468f, 40.8351f,
+        95.1880f,  110.9244f, 103.0146f, 166.3137f, 10.1316f,  24.6737f,  34.1453f,  44.5039f,
+        20.5283f,  79.5362f,  80.4462f,  123.3809f, 52.7734f,  184.2525f, 65.1362f,  212.4573f,
+        147.9188f, -19.1670f, 158.0026f, 20.7701f,  162.3696f, -14.8751f, 188.3148f, 21.5070f,
+        161.5482f, 184.1698f, 199.1086f, 213.0640f, 168.8931f, 88.4010f,  224.9343f, 145.4546f,
+        167.0391f, 14.7719f,  225.9076f, 35.9920f,  188.0454f, 173.7320f, 193.1542f, 185.1889f,
+        9.7935f,   155.5723f, 18.9354f,  196.5798f, 3.7319f,   81.7829f,  51.3855f,  132.6973f,
+        52.4097f,  122.6709f, 69.3770f,  126.0459f, 83.9766f,  40.8733f,  137.1827f, 68.4016f,
+        -0.6763f,  -16.7244f, 39.4674f,  36.9323f,  165.3600f, 96.2998f,  172.9588f, 141.5273f,
+        98.2916f,  29.1927f,  148.4108f, 88.7094f,  102.7704f, 116.5475f, 114.1754f, 148.9009f,
+        20.0692f,  147.2792f, 46.0554f,  187.2189f, 33.8616f,  -5.7911f,  67.4406f,  13.0553f,
+        16.7898f,  90.6905f,  47.3350f,  147.5951f, 149.6448f, 34.9492f,  191.1284f, 57.5630f,
+        97.0913f,  152.4916f, 136.5998f, 197.0638f, 117.2606f, 38.3403f,  176.7911f, 63.1255f,
+        29.2236f,  105.0804f, 89.1895f,  139.2277f, 58.5150f,  88.9746f,  89.9861f,  132.4418f,
+        77.6626f,  63.7197f,  84.2794f,  94.7469f,  130.0316f, 108.2651f, 173.9744f, 162.7832f,
+        125.1590f, 132.2845f, 183.7822f, 158.0233f, 31.4721f,  93.7989f,  51.2533f,  132.9762f,
+        174.2021f, 141.0848f, 202.4134f, 162.2841f, 11.1001f,  184.1428f, 37.1620f,  209.2240f,
+        177.2076f, 70.3730f,  181.2413f, 97.3360f,  -0.2527f,  98.7053f,  40.4109f,  107.1279f,
+        41.9845f,  -0.7119f,  63.8314f,  5.6998f,   145.5655f, 139.0148f, 193.0259f, 179.3967f,
+        10.8509f,  84.2082f,  60.9460f,  123.8838f, 57.9873f,  61.5364f,  107.4399f, 101.6481f,
+        77.1802f,  17.7313f,  102.7635f, 19.8975f,  39.0662f,  167.7982f, 59.0374f,  188.0644f,
+        119.4588f, 72.6661f,  164.6393f, 85.3368f,  146.1259f, 113.0609f, 194.4079f, 159.9718f,
+        159.9229f, 3.9862f,   189.9071f, 55.7634f,  41.0200f,  184.5329f, 94.7088f,  200.0870f};
     std::vector<float> scores_vec = {
-        0.1439f, 0.8791f, 0.0961f, 0.1535f, 0.5338f, 0.0675f, 0.0528f, 0.0005f, 0.4363f, 0.7746f, 0.0348f, 0.6523f,
-        0.8231f, 0.1680f, 0.1469f, 0.8608f, 0.8231f, 0.5389f, 0.8192f, 0.0928f, 0.3945f, 0.7378f, 0.2575f, 0.7523f,
-        0.5042f, 0.7503f, 0.4647f, 0.3679f, 0.2192f, 0.2084f, 0.7515f, 0.1189f, 0.0860f, 0.1763f, 0.1753f, 0.8231f,
-        0.3985f, 0.9904f, 0.1372f, 0.6535f, 0.4487f, 0.3929f, 0.8751f, 0.9756f, 0.8729f, 0.1923f, 0.2208f, 0.6561f,
-        0.2891f, 0.7347f, 0.5664f, 0.5509f, 0.8285f, 0.7105f, 0.0266f, 0.0495f, 0.6016f, 0.4862f, 0.2602f, 0.4187f,
-        0.7579f, 0.8266f, 0.5612f, 0.3854f, 0.2707f, 0.5219f, 0.3147f, 0.5641f, 0.6767f, 0.0661f, 0.0011f, 0.2123f,
-        0.8945f, 0.6463f, 0.1720f, 0.8903f, 0.4700f, 0.4761f, 0.9355f, 0.0595f, 0.2152f, 0.5858f, 0.1955f, 0.6795f,
-        0.2141f, 0.0992f, 0.2070f, 0.4227f, 0.1761f, 0.1347f, 0.8603f, 0.3204f, 0.3608f, 0.0553f, 0.3574f, 0.2648f,
-        0.6105f, 0.2054f, 0.8884f, 0.9297f, 0.0998f, 0.1074f, 0.1153f, 0.6196f, 0.1220f, 0.8524f, 0.7543f, 0.8198f,
-        0.5261f, 0.9967f, 0.0442f, 0.4013f, 0.3239f, 0.9486f, 0.5769f, 0.8062f, 0.1703f, 0.9786f, 0.4986f, 0.4937f,
-        0.9709f, 0.3807f, 0.3975f, 0.5848f, 0.1281f, 0.3211f, 0.1932f, 0.1033f, 0.8661f, 0.5893f, 0.3587f, 0.4087f,
-        0.4315f, 0.6331f, 0.9268f, 0.9328f, 0.3915f, 0.3293f, 0.4510f, 0.5679f, 0.4618f, 0.6588f, 0.5544f, 0.3207f,
-        0.3457f, 0.3786f, 0.0946f, 0.1661f, 0.7231f, 0.3891f, 0.2145f, 0.5627f, 0.7555f, 0.2574f, 0.8268f, 0.9275f,
-        0.5974f, 0.6689f, 0.0526f, 0.9455f, 0.3925f, 0.9239f, 0.5790f, 0.0046f, 0.0385f, 0.6804f, 0.5627f, 0.0265f,
-        0.7435f, 0.8521f, 0.4964f, 0.4658f, 0.0055f, 0.7866f, 0.3307f, 0.8788f, 0.3731f, 0.5651f, 0.2703f, 0.1606f,
-        0.7749f, 0.4966f, 0.5365f, 0.9654f, 0.9636f, 0.8556f, 0.1876f, 0.5943f, 0.8781f, 0.3745f, 0.1011f, 0.8110f,
-        0.4818f, 0.5644f, 0.9821f, 0.6072f, 0.4250f, 0.3700f, 0.4176f, 0.1184f};
+        0.1439f, 0.8791f, 0.0961f, 0.1535f, 0.5338f, 0.0675f, 0.0528f, 0.0005f, 0.4363f, 0.7746f,
+        0.0348f, 0.6523f, 0.8231f, 0.1680f, 0.1469f, 0.8608f, 0.8231f, 0.5389f, 0.8192f, 0.0928f,
+        0.3945f, 0.7378f, 0.2575f, 0.7523f, 0.5042f, 0.7503f, 0.4647f, 0.3679f, 0.2192f, 0.2084f,
+        0.7515f, 0.1189f, 0.0860f, 0.1763f, 0.1753f, 0.8231f, 0.3985f, 0.9904f, 0.1372f, 0.6535f,
+        0.4487f, 0.3929f, 0.8751f, 0.9756f, 0.8729f, 0.1923f, 0.2208f, 0.6561f, 0.2891f, 0.7347f,
+        0.5664f, 0.5509f, 0.8285f, 0.7105f, 0.0266f, 0.0495f, 0.6016f, 0.4862f, 0.2602f, 0.4187f,
+        0.7579f, 0.8266f, 0.5612f, 0.3854f, 0.2707f, 0.5219f, 0.3147f, 0.5641f, 0.6767f, 0.0661f,
+        0.0011f, 0.2123f, 0.8945f, 0.6463f, 0.1720f, 0.8903f, 0.4700f, 0.4761f, 0.9355f, 0.0595f,
+        0.2152f, 0.5858f, 0.1955f, 0.6795f, 0.2141f, 0.0992f, 0.2070f, 0.4227f, 0.1761f, 0.1347f,
+        0.8603f, 0.3204f, 0.3608f, 0.0553f, 0.3574f, 0.2648f, 0.6105f, 0.2054f, 0.8884f, 0.9297f,
+        0.0998f, 0.1074f, 0.1153f, 0.6196f, 0.1220f, 0.8524f, 0.7543f, 0.8198f, 0.5261f, 0.9967f,
+        0.0442f, 0.4013f, 0.3239f, 0.9486f, 0.5769f, 0.8062f, 0.1703f, 0.9786f, 0.4986f, 0.4937f,
+        0.9709f, 0.3807f, 0.3975f, 0.5848f, 0.1281f, 0.3211f, 0.1932f, 0.1033f, 0.8661f, 0.5893f,
+        0.3587f, 0.4087f, 0.4315f, 0.6331f, 0.9268f, 0.9328f, 0.3915f, 0.3293f, 0.4510f, 0.5679f,
+        0.4618f, 0.6588f, 0.5544f, 0.3207f, 0.3457f, 0.3786f, 0.0946f, 0.1661f, 0.7231f, 0.3891f,
+        0.2145f, 0.5627f, 0.7555f, 0.2574f, 0.8268f, 0.9275f, 0.5974f, 0.6689f, 0.0526f, 0.9455f,
+        0.3925f, 0.9239f, 0.5790f, 0.0046f, 0.0385f, 0.6804f, 0.5627f, 0.0265f, 0.7435f, 0.8521f,
+        0.4964f, 0.4658f, 0.0055f, 0.7866f, 0.3307f, 0.8788f, 0.3731f, 0.5651f, 0.2703f, 0.1606f,
+        0.7749f, 0.4966f, 0.5365f, 0.9654f, 0.9636f, 0.8556f, 0.1876f, 0.5943f, 0.8781f, 0.3745f,
+        0.1011f, 0.8110f, 0.4818f, 0.5644f, 0.9821f, 0.6072f, 0.4250f, 0.3700f, 0.4176f, 0.1184f};
 
     migraphx::parameter_map host_params;
     host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
@@ -583,7 +629,11 @@ TEST_CASE(nms_100boxes_2batch_test)
 
     auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
-    std::vector<int64_t> gold = {0, 0, 37, 0, 0, 43, 0, 0, 78, 0, 0, 99, 0, 0, 72, 0, 0, 75, 0, 0, 98, 0, 0, 1, 0, 0, 42, 0, 0, 44, 0, 0, 15, 0, 0, 90, 0, 0, 52, 0, 0, 61, 0, 0, 12, 1, 0, 9, 1, 0, 94, 1, 0, 17, 1, 0, 20, 1, 0, 83, 1, 0, 84, 1, 0, 13, 1, 0, 59, 1, 0, 35, 1, 0, 55, 1, 0, 34, 1, 0, 61, 1, 0, 75, 1, 0, 88, 1, 0, 28};
+    std::vector<int64_t> gold = {0, 0, 37, 0, 0, 43, 0, 0, 78, 0, 0, 99, 0, 0, 72, 0, 0, 75,
+                                 0, 0, 98, 0, 0, 1,  0, 0, 42, 0, 0, 44, 0, 0, 15, 0, 0, 90,
+                                 0, 0, 52, 0, 0, 61, 0, 0, 12, 1, 0, 9,  1, 0, 94, 1, 0, 17,
+                                 1, 0, 20, 1, 0, 83, 1, 0, 84, 1, 0, 13, 1, 0, 59, 1, 0, 35,
+                                 1, 0, 55, 1, 0, 34, 1, 0, 61, 1, 0, 75, 1, 0, 88, 1, 0, 28};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
     EXPECT(num_selected == 30);
 }
@@ -602,35 +652,39 @@ TEST_CASE(nms_30boxes_3class_test)
     auto iou_threshold   = mm->add_literal(0.4500f);
     auto score_threshold = mm->add_literal(0.1500f);
 
-    auto nms =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
-                            boxes_p,
-                            scores_p,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_p,
+                                   scores_p,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
     add_nms_return(mm, nms);
 
     std::vector<float> boxes_vec = {
-        31.2680f, 53.5348f, 37.7043f, 73.6253f, 1.8071f, 55.2945f, 3.9368f, 78.7402f, 40.5016f, 12.5670f, 45.0345f, 32.9366f,
-        78.2552f, 12.9548f, 80.7117f, 35.6526f, 73.9527f, 67.9870f, 79.4405f, 71.9065f, -3.8066f, -7.7339f, 10.2705f, 11.5692f,
-        45.4706f, 34.8613f, 67.4569f, 48.4119f, 17.4632f, 30.3439f, 30.8192f, 43.8443f, 64.5403f, 44.3725f, 79.9380f, 66.0477f,
-        0.7877f, 1.3956f, 6.4307f, 24.7471f, 65.1632f, 44.8608f, 84.5766f, 62.0721f, 59.3935f, 24.0849f, 74.6026f, 36.1925f,
-        -1.0372f, 43.7485f, 19.8379f, 55.2458f, -6.6257f, -1.7353f, 16.1976f, 8.1505f, 62.2758f, 32.2798f, 71.2775f, 41.5966f,
-        10.9190f, 36.7777f, 14.0023f, 46.7824f, 39.6937f, 15.6139f, 45.8900f, 18.6783f, 67.7244f, 9.7794f, 78.7948f, 12.5604f,
-        34.0204f, 5.6094f, 56.7713f, 24.5464f, 26.9281f, 21.9014f, 36.6292f, 33.1611f, 26.2374f, -3.4581f, 44.9652f, 18.9477f,
-        -1.6661f, 68.2450f, 11.7649f, 83.3261f, 74.8979f, 31.4950f, 80.1025f, 33.3041f, 20.6639f, 62.4061f, 29.0408f, 67.0291f,
-        7.1374f, 75.0864f, 23.1608f, 80.8203f, 14.6460f, -5.2621f, 31.1216f, 18.1798f, 71.6501f, 49.1185f, 82.6496f, 55.1487f,
-        4.4135f, 63.2815f, 10.6723f, 76.1439f, 60.5823f, 39.4727f, 78.1862f, 62.0048f, 54.1855f, 22.5844f, 59.0696f, 46.0598f};
+        31.2680f, 53.5348f, 37.7043f, 73.6253f, 1.8071f,  55.2945f, 3.9368f,  78.7402f, 40.5016f,
+        12.5670f, 45.0345f, 32.9366f, 78.2552f, 12.9548f, 80.7117f, 35.6526f, 73.9527f, 67.9870f,
+        79.4405f, 71.9065f, -3.8066f, -7.7339f, 10.2705f, 11.5692f, 45.4706f, 34.8613f, 67.4569f,
+        48.4119f, 17.4632f, 30.3439f, 30.8192f, 43.8443f, 64.5403f, 44.3725f, 79.9380f, 66.0477f,
+        0.7877f,  1.3956f,  6.4307f,  24.7471f, 65.1632f, 44.8608f, 84.5766f, 62.0721f, 59.3935f,
+        24.0849f, 74.6026f, 36.1925f, -1.0372f, 43.7485f, 19.8379f, 55.2458f, -6.6257f, -1.7353f,
+        16.1976f, 8.1505f,  62.2758f, 32.2798f, 71.2775f, 41.5966f, 10.9190f, 36.7777f, 14.0023f,
+        46.7824f, 39.6937f, 15.6139f, 45.8900f, 18.6783f, 67.7244f, 9.7794f,  78.7948f, 12.5604f,
+        34.0204f, 5.6094f,  56.7713f, 24.5464f, 26.9281f, 21.9014f, 36.6292f, 33.1611f, 26.2374f,
+        -3.4581f, 44.9652f, 18.9477f, -1.6661f, 68.2450f, 11.7649f, 83.3261f, 74.8979f, 31.4950f,
+        80.1025f, 33.3041f, 20.6639f, 62.4061f, 29.0408f, 67.0291f, 7.1374f,  75.0864f, 23.1608f,
+        80.8203f, 14.6460f, -5.2621f, 31.1216f, 18.1798f, 71.6501f, 49.1185f, 82.6496f, 55.1487f,
+        4.4135f,  63.2815f, 10.6723f, 76.1439f, 60.5823f, 39.4727f, 78.1862f, 62.0048f, 54.1855f,
+        22.5844f, 59.0696f, 46.0598f};
     std::vector<float> scores_vec = {
-        0.9367f, 0.1879f, 0.1073f, 0.4976f, 0.5195f, 0.5082f, 0.4367f, 0.9948f, 0.4863f, 0.4779f, 0.4218f, 0.0668f,
-        0.5930f, 0.2280f, 0.6376f, 0.0508f, 0.9814f, 0.4690f, 0.8968f, 0.4756f, 0.0603f, 0.8222f, 0.6482f, 0.7818f,
-        0.4282f, 0.6379f, 0.8562f, 0.6311f, 0.3477f, 0.6625f, 0.6719f, 0.9606f, 0.3709f, 0.4251f, 0.8121f, 0.5058f,
-        0.7366f, 0.4597f, 0.2155f, 0.7452f, 0.1312f, 0.1986f, 0.6268f, 0.7473f, 0.8947f, 0.2726f, 0.1107f, 0.9560f,
-        0.1544f, 0.1977f, 0.2913f, 0.5294f, 0.8828f, 0.7605f, 0.7082f, 0.1752f, 0.3577f, 0.4784f, 0.1474f, 0.2734f,
-        0.3083f, 0.1273f, 0.5502f, 0.7050f, 0.0699f, 0.4811f, 0.7822f, 0.7480f, 0.8151f, 0.4482f, 0.8206f, 0.2408f,
-        0.3608f, 0.1764f, 0.4675f, 0.3921f, 0.2409f, 0.7518f, 0.3138f, 0.2728f, 0.1309f, 0.4388f, 0.3030f, 0.3693f,
-        0.2360f, 0.7632f, 0.9300f, 0.4979f, 0.6430f, 0.8672f};
+        0.9367f, 0.1879f, 0.1073f, 0.4976f, 0.5195f, 0.5082f, 0.4367f, 0.9948f, 0.4863f, 0.4779f,
+        0.4218f, 0.0668f, 0.5930f, 0.2280f, 0.6376f, 0.0508f, 0.9814f, 0.4690f, 0.8968f, 0.4756f,
+        0.0603f, 0.8222f, 0.6482f, 0.7818f, 0.4282f, 0.6379f, 0.8562f, 0.6311f, 0.3477f, 0.6625f,
+        0.6719f, 0.9606f, 0.3709f, 0.4251f, 0.8121f, 0.5058f, 0.7366f, 0.4597f, 0.2155f, 0.7452f,
+        0.1312f, 0.1986f, 0.6268f, 0.7473f, 0.8947f, 0.2726f, 0.1107f, 0.9560f, 0.1544f, 0.1977f,
+        0.2913f, 0.5294f, 0.8828f, 0.7605f, 0.7082f, 0.1752f, 0.3577f, 0.4784f, 0.1474f, 0.2734f,
+        0.3083f, 0.1273f, 0.5502f, 0.7050f, 0.0699f, 0.4811f, 0.7822f, 0.7480f, 0.8151f, 0.4482f,
+        0.8206f, 0.2408f, 0.3608f, 0.1764f, 0.4675f, 0.3921f, 0.2409f, 0.7518f, 0.3138f, 0.2728f,
+        0.1309f, 0.4388f, 0.3030f, 0.3693f, 0.2360f, 0.7632f, 0.9300f, 0.4979f, 0.6430f, 0.8672f};
 
     migraphx::parameter_map host_params;
     host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
@@ -638,7 +692,9 @@ TEST_CASE(nms_30boxes_3class_test)
 
     auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
-    std::vector<int64_t> gold = {0, 0, 7, 0, 0, 16, 0, 0, 0, 0, 0, 18, 0, 0, 26, 0, 1, 1, 0, 1, 17, 0, 1, 14, 0, 1, 22, 0, 1, 4, 0, 2, 26, 0, 2, 29, 0, 2, 10, 0, 2, 6, 0, 2, 25};
+    std::vector<int64_t> gold = {0, 0, 7,  0, 0, 16, 0, 0, 0,  0, 0, 18, 0, 0, 26,
+                                 0, 1, 1,  0, 1, 17, 0, 1, 14, 0, 1, 22, 0, 1, 4,
+                                 0, 2, 26, 0, 2, 29, 0, 2, 10, 0, 2, 6,  0, 2, 25};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
     EXPECT(num_selected == 15);
 }
@@ -657,218 +713,296 @@ TEST_CASE(nms_200boxes_2batch_2class_test)
     auto iou_threshold   = mm->add_literal(0.3000f);
     auto score_threshold = mm->add_literal(0.2500f);
 
-    auto nms =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
-                            boxes_p,
-                            scores_p,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_p,
+                                   scores_p,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
     add_nms_return(mm, nms);
 
     std::vector<float> boxes_vec = {
-        132.1894f, 453.1169f, 199.9736f, 545.7127f, 64.3090f, 275.1729f, 104.8258f, 338.3436f, 76.1273f, 401.7875f, 135.6448f, 487.9920f,
-        12.8305f, 442.3624f, 77.1708f, 466.2458f, -5.9609f, 340.1129f, 126.0715f, 451.3386f, 15.0119f, 224.3769f, 56.2927f, 236.5545f,
-        427.8277f, -14.2917f, 561.9954f, 95.4457f, 4.7940f, -55.8461f, 69.2637f, 71.6517f, 41.3494f, 202.9014f, 91.1927f, 274.2992f,
-        375.6902f, 208.6749f, 451.5645f, 285.6396f, 258.4982f, 179.9212f, 321.7420f, 227.4412f, 367.5344f, 211.3590f, 406.8828f, 356.8083f,
-        277.5064f, 220.9636f, 353.4056f, 331.1991f, 429.2783f, 390.3169f, 452.8968f, 446.2962f, 292.5150f, 40.8054f, 345.9525f, 67.8517f,
-        218.4112f, 95.7302f, 303.7139f, 129.4475f, 325.0759f, 361.4403f, 387.6738f, 431.5647f, 161.8149f, 353.1971f, 285.5779f, 494.6398f,
-        153.4061f, 442.2182f, 192.6577f, 552.6060f, 161.0782f, 419.9203f, 306.5742f, 452.9917f, 25.8953f, 380.4122f, 143.8188f, 509.4868f,
-        325.7002f, 128.4980f, 470.8716f, 185.8499f, 67.4107f, 136.8775f, 193.2931f, 264.7841f, 65.6790f, 115.5359f, 87.8525f, 152.5492f,
-        83.4548f, 256.5595f, 162.8974f, 349.7399f, 407.8717f, 399.8657f, 434.1985f, 538.9396f, 103.6427f, 152.6073f, 226.5586f, 192.0336f,
-        299.0049f, 226.3779f, 387.0450f, 330.6239f, 408.0779f, 74.0950f, 448.3318f, 222.2046f, -30.8828f, 73.1804f, 108.6275f, 96.6196f,
-        373.4308f, 90.5068f, 391.5936f, 104.6787f, 111.3250f, -21.7549f, 196.3405f, 79.7002f, 54.0937f, 448.8364f, 162.5287f, 500.4571f,
-        339.5665f, 195.6321f, 349.3349f, 207.2475f, 409.8580f, 381.1502f, 499.9386f, 452.9707f, 86.2250f, 284.0088f, 208.7943f, 397.3206f,
-        278.8861f, 74.2190f, 289.9477f, 117.7022f, 106.2550f, 62.2701f, 183.5792f, 113.1921f, 257.3803f, 342.4895f, 296.9053f, 469.4987f,
-        261.0432f, 93.1105f, 360.8189f, 171.6012f, 295.8262f, 393.3591f, 314.5092f, 519.9261f, 241.4629f, 36.2717f, 382.0835f, 103.7837f,
-        0.3826f, 267.3577f, 134.6972f, 410.3510f, 332.4151f, 358.2527f, 361.1253f, 456.2211f, 312.7919f, 108.4937f, 361.9585f, 126.7627f,
-        297.0153f, 71.6643f, 385.5729f, 204.5431f, -16.9604f, 445.3092f, 91.0309f, 519.2097f, 189.9415f, 121.2467f, 256.8973f, 143.3509f,
-        192.3739f, 203.1031f, 216.6613f, 226.8539f, 35.0965f, 164.5365f, 51.6150f, 267.9791f, 36.2014f, 122.4881f, 186.1665f, 130.5466f,
-        186.0576f, 366.0443f, 254.9050f, 409.7468f, 305.9496f, 375.0105f, 436.9568f, 396.8388f, 82.0940f, 155.7987f, 154.9680f, 222.5193f,
-        345.6593f, 386.1935f, 484.0906f, 448.9323f, 265.8611f, 67.1577f, 279.9372f, 145.9173f, 371.2164f, -19.1800f, 389.2053f, 23.4858f,
-        166.5204f, 282.6964f, 306.0356f, 288.4709f, 178.5089f, 450.7671f, 320.6853f, 543.3107f, 285.9132f, -9.0198f, 333.8062f, 47.6641f,
-        437.0255f, 54.9746f, 490.9451f, 153.0235f, 211.6987f, 250.8616f, 280.1138f, 268.0530f, 232.8247f, 403.4440f, 295.8328f, 406.4968f,
-        286.3401f, 25.5231f, 315.6569f, 63.5189f, 301.3286f, 163.1046f, 436.1865f, 232.1301f, 16.5538f, 343.6795f, 55.2966f, 403.3963f,
-        204.8009f, 124.9041f, 310.8865f, 246.6391f, 235.2927f, 65.7693f, 246.2989f, 123.0671f, 457.4555f, 57.7300f, 464.2295f, 137.7658f,
-        197.5504f, 160.3075f, 295.9562f, 249.7413f, 208.4036f, 237.5821f, 259.9170f, 241.8350f, 431.7683f, 392.0298f, 530.4317f, 469.7846f,
-        217.7836f, 294.9363f, 232.7928f, 347.3161f, 19.1783f, 313.3156f, 161.7061f, 377.0863f, 52.1937f, 483.5222f, 164.7224f, 499.4650f,
-        -18.1881f, 147.1016f, 113.3757f, 264.7419f, -10.3830f, 130.9681f, 10.9511f, 272.3863f, 191.6208f, 459.5145f, 240.3248f, 463.8325f,
-        356.6797f, 77.6355f, 412.5629f, 168.2401f, 326.2139f, 307.5013f, 407.2526f, 422.3140f, -6.5422f, 355.5684f, 38.6912f, 399.0047f,
-        279.9745f, -10.2789f, 290.0085f, 108.0669f, 49.1601f, 186.5052f, 105.1230f, 281.7262f, 451.0742f, 30.5586f, 490.0021f, 170.0038f,
-        54.4314f, 19.1028f, 112.9336f, 166.2725f, 298.1461f, 228.2593f, 328.4931f, 235.5688f, 143.1079f, 111.0670f, 183.1305f, 178.3627f,
-        273.5727f, 356.7796f, 367.9886f, 439.2808f, 176.7118f, 442.3701f, 235.5468f, 465.2348f, 353.5905f, 375.8070f, 406.0526f, 426.9136f,
-        75.0636f, 58.9357f, 155.6155f, 207.0952f, 394.8923f, 135.3580f, 510.8995f, 138.7764f, 221.3792f, 93.1523f, 278.8305f, 161.5760f,
-        333.7764f, 4.2413f, 422.3168f, 130.7968f, 352.3830f, 447.2686f, 497.3472f, 496.5298f, 460.0268f, 164.7789f, 538.8018f, 237.2689f,
-        43.6929f, 38.9803f, 180.2527f, 185.7092f, 83.8176f, 387.4572f, 203.0748f, 459.2138f, 120.3420f, 189.3440f, 130.0911f, 209.8513f,
-        98.9678f, 13.2052f, 163.9035f, 21.9117f, 238.6976f, 10.0373f, 343.7471f, 151.9043f, 422.7512f, 299.3224f, 570.7713f, 339.9280f,
-        460.4900f, 353.3999f, 529.7881f, 429.5054f, 255.9741f, 98.2099f, 270.7991f, 112.7245f, 277.1439f, 426.6355f, 361.8833f, 490.7601f,
-        420.0563f, 355.7057f, 439.9143f, 495.2914f, 409.9785f, 386.2606f, 522.9550f, 462.1201f, 63.6084f, 40.9810f, 140.2522f, 186.6801f,
-        209.8752f, 5.4847f, 318.6665f, 45.0513f, 351.1511f, 395.6231f, 481.6860f, 471.8004f, 104.2444f, 88.3651f, 198.9577f, 217.4352f,
-        173.7778f, 275.5634f, 266.0312f, 343.3530f, 436.0951f, 358.6616f, 549.5261f, 401.3052f, 429.2604f, -0.0863f, 555.7863f, 128.3795f,
-        387.8089f, 360.8724f, 518.2979f, 419.9659f, 396.0101f, 429.2169f, 402.4382f, 509.2946f, 92.6291f, 290.9362f, 176.5014f, 437.4388f,
-        143.8130f, 206.2184f, 177.0371f, 235.0044f, 209.0457f, 415.3847f, 338.2372f, 461.2934f, 231.5831f, 260.9141f, 329.1943f, 266.5435f,
-        220.9448f, 342.6935f, 284.5580f, 402.0774f, 303.8214f, 394.8393f, 332.8489f, 425.6666f, 178.4043f, 323.5138f, 229.9188f, 425.8390f,
-        321.6556f, 129.9190f, 427.5185f, 157.9359f, 151.0502f, 8.1484f, 182.4998f, 109.6955f, 157.8666f, 99.0403f, 172.8104f, 139.2982f,
-        -3.0452f, 224.4737f, 130.2711f, 278.4012f, 36.9224f, 226.1483f, 151.7898f, 279.1286f, 409.8757f, 237.4242f, 440.6452f, 345.2202f,
-        200.8640f, 162.2960f, 245.4184f, 232.8059f, 41.0147f, 366.0289f, 186.8531f, 420.8625f, 326.4108f, 392.5565f, 432.9303f, 520.5973f,
-        231.0067f, 80.2522f, 322.9745f, 166.4729f, -12.8403f, 351.8312f, 33.9963f, 384.6920f, 135.3959f, 271.4291f, 180.9655f, 406.5427f,
-        85.0562f, 235.5178f, 91.9452f, 287.5727f, 273.1645f, 90.8612f, 382.7083f, 97.6691f, 133.7990f, 360.2684f, 141.2321f, 434.9638f,
-        31.6115f, 470.5798f, 33.3353f, 490.0465f, -27.3799f, 342.6524f, 82.3149f, 379.1839f, 219.6726f, 402.7702f, 362.0547f, 515.0898f,
-        -45.9977f, 481.8516f, 67.7212f, 502.3336f, 388.7589f, 115.4080f, 460.0333f, 236.6427f, 40.9882f, 248.8122f, 114.4089f, 389.4114f,
-        270.2910f, 191.2797f, 336.2753f, 282.6530f, 197.6581f, 439.8926f, 247.0300f, 546.7361f, 182.0580f, -6.7583f, 260.7935f, 100.5661f,
-        3.2778f, 131.7233f, 68.5193f, 280.6516f, 356.3126f, 411.8249f, 446.4396f, 463.7141f, 379.1163f, 129.3928f, 513.9362f, 154.6585f,
-        -69.1199f, 354.7185f, 80.1365f, 433.0744f, 82.9357f, 151.1645f, 95.6685f, 231.6187f, 422.7932f, 476.2348f, 481.1110f, 503.7437f,
-        260.7842f, 395.5883f, 288.7094f, 487.9416f, 48.2868f, 149.1079f, 101.7528f, 152.2125f, 79.4785f, 315.4853f, 123.3120f, 454.7079f,
-        316.4901f, 148.2175f, 343.4961f, 188.6391f, 304.9847f, 299.7342f, 419.8321f, 306.6287f, 262.2399f, 320.6758f, 337.1869f, 337.8050f,
-        407.5904f, 396.3992f, 545.5580f, 433.1963f, 244.1037f, -8.6806f, 249.9599f, 33.1314f, 144.6461f, 107.1346f, 155.6258f, 113.0233f,
-        208.0726f, 334.6470f, 269.1603f, 377.2708f, 173.3525f, 266.8875f, 186.3138f, 296.6358f, 92.1346f, 219.0953f, 132.2813f, 276.5098f,
-        -50.9776f, -1.5900f, 96.9408f, 56.8000f, 160.0388f, 148.3819f, 192.1737f, 199.8940f, 340.4449f, 407.6198f, 370.9644f, 457.4804f,
-        -34.0173f, 8.2614f, 52.4551f, 22.6314f, 181.9884f, 195.8403f, 257.1901f, 200.5959f, 278.2621f, 457.0166f, 365.7473f, 488.1317f,
-        276.6353f, -31.4300f, 333.7688f, 82.3108f, 326.2304f, 300.5375f, 450.4180f, 449.1682f, 394.4356f, 59.1311f, 416.0841f, 198.4815f,
-        323.4377f, 395.2401f, 388.2682f, 471.3687f, -0.4884f, 332.9131f, 103.2861f, 413.1549f, 172.3276f, 418.9163f, 302.6948f, 466.7889f,
-        273.6699f, 49.8039f, 329.7361f, 166.1209f, 79.9860f, 208.1720f, 165.5801f, 323.1208f, 15.6250f, 326.2367f, 26.9268f, 453.0333f,
-        98.6064f, 55.6348f, 124.9839f, 190.0650f, 221.7964f, 82.5141f, 233.0980f, 148.2322f, 152.2380f, -44.0412f, 261.6923f, 71.2233f,
-        66.3730f, 418.6809f, 110.2940f, 539.8344f, 357.7888f, 331.5282f, 466.6268f, 378.4887f, 457.3967f, 248.0516f, 468.2900f, 387.5087f,
-        35.9143f, 364.4689f, 165.4340f, 379.5258f, 402.0395f, 191.2334f, 527.5334f, 340.3795f, 1.8053f, 180.1951f, 16.0557f, 295.9387f,
-        460.2114f, 217.3174f, 464.7511f, 232.2148f, 471.2709f, 270.8305f, 480.6579f, 369.6087f, -58.0695f, 97.7211f, 70.1214f, 103.8139f,
-        363.5242f, 386.1504f, 399.4951f, 501.9083f, 443.7544f, 345.8341f, 526.4471f, 465.9183f, 420.6959f, 129.4022f, 485.2063f, 220.1614f,
-        425.5884f, 224.9686f, 545.1217f, 353.6407f, 238.2388f, 62.7213f, 312.0847f, 78.3060f, 1.2788f, 465.1168f, 76.8773f, 507.2295f,
-        350.7072f, 420.0901f, 499.0819f, 482.8026f, 295.2295f, 457.2856f, 318.5988f, 464.6119f, 248.9387f, 366.2193f, 368.7308f, 464.4846f,
-        266.4057f, -43.0988f, 411.9049f, 94.8485f, 365.3591f, 230.8355f, 381.3726f, 246.8133f, 213.6699f, 419.1429f, 302.9046f, 467.1919f,
-        282.3146f, 326.7091f, 321.6300f, 338.5049f, 157.0835f, 271.7193f, 238.9818f, 413.4953f, -3.7474f, 97.9864f, 45.0004f, 165.3309f,
-        28.3577f, 158.4742f, 71.5941f, 260.1006f, 284.2465f, 120.1271f, 370.7495f, 246.4540f, 483.6205f, 186.3921f, 511.9348f, 335.0511f,
-        -27.5488f, 218.5612f, 43.3521f, 243.6668f, 229.8062f, 103.3855f, 327.7773f, 223.5129f, 365.4548f, 86.1273f, 385.5540f, 219.3533f,
-        343.5581f, 121.2852f, 483.2167f, 129.5677f, 234.4260f, 125.8439f, 310.7789f, 239.2034f, 248.4032f, 48.0437f, 371.5128f, 101.8978f,
-        299.1465f, 387.2317f, 397.5784f, 484.8726f, 376.0880f, 262.2631f, 482.8782f, 339.8563f, 7.2930f, 47.0424f, 114.9965f, 86.7440f,
-        397.3961f, 336.3557f, 528.7860f, 357.5037f, -33.2049f, 414.6207f, 59.2223f, 433.0458f, 396.8727f, 110.5703f, 439.3271f, 126.9654f,
-        30.4567f, 27.2849f, 46.2837f, 123.3157f, 51.6484f, -22.3715f, 142.9798f, 30.9887f, -3.4962f, 6.9860f, 7.3904f, 40.2644f,
-        204.1520f, 329.0802f, 241.1047f, 433.1711f, 162.1569f, 441.9229f, 172.2023f, 545.2635f, 41.6043f, -18.2279f, 124.3886f, 63.1082f,
-        213.0999f, 303.8811f, 237.9903f, 444.1898f, 155.2101f, 6.7177f, 247.1608f, 65.1444f, 324.4111f, 233.2946f, 443.2500f, 358.8382f,
-        384.8351f, 371.9398f, 508.2953f, 384.1355f, 302.7226f, 123.9848f, 349.8446f, 235.2196f, 20.8081f, -68.6720f, 103.6023f, 79.6067f,
-        105.2511f, 234.0231f, 190.1397f, 361.1662f, 420.9290f, 451.9373f, 492.3893f, 539.3073f, -4.9387f, 81.6146f, 93.6732f, 176.0028f,
-        187.2764f, 67.9256f, 219.5794f, 121.5657f, 397.7987f, 10.8413f, 544.7059f, 113.0846f, 467.5255f, 219.7334f, 483.1394f, 335.5223f,
-        143.3246f, 223.3545f, 267.8786f, 373.0906f, 288.9383f, 358.9469f, 378.4586f, 433.3239f, 209.6311f, 371.4695f, 247.1145f, 381.6038f,
-        320.6775f, 401.3793f, 432.7831f, 491.1622f, 8.9968f, 393.5190f, 22.5845f, 412.2537f, 13.8844f, 104.8985f, 130.2727f, 142.3685f,
-        262.6455f, 252.9446f, 351.5533f, 302.9328f, 107.5252f, 93.7443f, 125.0270f, 203.6677f, 326.6030f, 150.6990f, 339.4493f, 179.0864f,
-        119.1742f, 453.1236f, 232.0488f, 478.8208f, 420.9991f, 337.0981f, 465.6465f, 344.7978f, 342.8767f, 421.7388f, 476.3827f, 552.8516f,
-        189.1445f, 156.2901f, 303.6933f, 260.6224f, 333.9324f, 265.2428f, 438.9627f, 272.1948f, 114.3128f, 240.9499f, 156.8251f, 246.1655f,
-        193.8135f, 11.5223f, 300.4463f, 95.7648f, 27.6040f, 96.8022f, 169.8780f, 139.8998f, 423.1219f, 218.8621f, 437.7643f, 308.7743f,
-        386.7347f, 0.8091f, 436.3329f, 66.5652f, 433.0917f, 396.4442f, 469.0579f, 535.0178f, 408.9413f, 39.9801f, 468.5356f, 83.8636f,
-        423.9944f, 47.8940f, 535.6019f, 150.0867f, 78.3370f, 378.1336f, 149.9992f, 387.1877f, 422.8927f, -23.2443f, 508.9316f, 120.1789f,
-        261.7021f, 376.5726f, 309.5111f, 523.7055f, 200.2215f, 307.2894f, 222.2736f, 418.4116f, 259.8004f, -0.8479f, 300.5735f, 69.4688f,
-        106.7550f, 329.0340f, 235.8474f, 362.8130f, 98.8964f, 254.7818f, 189.6566f, 376.8467f, 91.9970f, 323.3163f, 149.3173f, 434.0331f,
-        -18.1340f, 397.0634f, 100.5620f, 431.1345f, 242.9804f, 325.0598f, 253.5845f, 393.2908f, 424.4659f, 258.1096f, 463.2957f, 328.0667f,
-        297.4333f, 99.1641f, 332.7187f, 223.2992f, 186.5782f, 297.1904f, 334.3975f, 400.0833f, 161.1921f, 430.0698f, 267.4008f, 526.9018f,
-        185.6758f, 244.8488f, 278.7259f, 342.6730f, 103.7673f, 311.5224f, 105.5101f, 352.8224f, 397.2368f, 190.3715f, 425.6990f, 246.7565f,
-        51.3437f, 374.1586f, 147.0393f, 381.9622f, 329.5223f, 439.7066f, 387.1005f, 557.9608f, 310.6336f, 47.4363f, 449.3514f, 112.9530f,
-        229.9626f, 68.0539f, 344.9065f, 134.3514f, 397.6331f, 250.9398f, 465.2933f, 288.4979f, 89.1863f, 224.5854f, 201.8640f, 256.7900f,
-        367.6410f, 241.4922f, 513.9763f, 330.0776f, 329.8622f, 6.7118f, 399.5483f, 42.3622f, 351.0067f, 196.8547f, 447.7431f, 207.4218f,
-        263.3493f, 233.8098f, 401.2304f, 349.1684f, 404.1452f, 264.0487f, 442.1978f, 321.1426f, 430.0009f, 299.8394f, 563.0980f, 357.4945f,
-        202.3143f, 327.4748f, 217.8485f, 392.7412f, 358.1485f, 259.5528f, 455.7672f, 381.9944f, 313.4684f, 370.7192f, 431.1113f, 419.5239f,
-        180.1469f, 255.4066f, 272.7232f, 369.3540f, 426.0572f, 198.2577f, 500.8918f, 339.2499f, 150.7206f, 253.3635f, 243.7053f, 352.8329f,
-        270.9340f, 17.9364f, 294.5319f, 83.2569f, 36.4112f, 80.3679f, 69.5312f, 192.7886f, 92.2801f, 229.0865f, 133.4951f, 298.3132f,
-        375.3135f, 405.1188f, 465.3827f, 467.8684f, 164.8547f, 299.8922f, 231.6980f, 379.1594f, 178.3286f, 21.0337f, 215.7555f, 69.3744f,
-        56.7212f, 287.8708f, 189.2598f, 304.4041f, 217.4480f, 79.4625f, 274.1624f, 142.2755f, 369.1791f, 357.2809f, 436.6378f, 376.7356f,
-        416.5593f, 382.6425f, 478.6048f, 444.7983f, 21.0025f, 254.7366f, 49.1120f, 338.7197f, 232.4042f, 225.8433f, 342.4166f, 365.5193f,
-        199.7265f, 166.0972f, 267.5468f, 172.4943f, 305.4298f, 176.3264f, 308.8521f, 269.9237f, 151.3188f, 397.4529f, 295.9569f, 466.6555f,
-        138.0480f, 359.6507f, 260.5968f, 363.6696f, 181.5352f, 240.7855f, 290.3455f, 278.9682f, 225.7522f, 174.7890f, 356.2469f, 193.4433f,
-        182.4345f, 8.5387f, 318.5487f, 41.8410f, 210.4292f, 50.5482f, 261.7152f, 92.4592f, 362.9012f, 66.1153f, 454.9341f, 126.9099f,
-        326.9678f, 146.7783f, 418.6802f, 226.6052f, 150.2754f, 471.4981f, 191.1031f, 472.6456f, 383.2531f, 240.0174f, 417.3240f, 265.1360f,
-        417.8392f, 109.9494f, 435.8114f, 124.8908f, 27.1272f, 11.4244f, 126.3650f, 94.3257f, 232.6628f, 144.1367f, 350.0197f, 194.1688f,
-        85.4650f, 366.5097f, 199.8470f, 449.2209f, 345.5237f, 174.6456f, 393.6487f, 208.6972f, 103.6008f, 383.9478f, 135.1845f, 388.5580f,
-        301.4075f, 330.7206f, 369.9960f, 471.9843f, 86.3247f, 46.8414f, 168.7999f, 63.9793f, 186.5999f, 294.3789f, 324.5439f, 314.2809f,
-        408.6489f, 468.1303f, 539.9976f, 490.9658f, 121.9074f, 127.4639f, 259.4001f, 274.6741f, 374.0247f, -21.0436f, 501.7138f, 71.9877f,
-        421.1110f, 415.6848f, 565.8336f, 507.6180f, 402.2457f, 367.8241f, 472.6052f, 515.8422f, 78.8962f, 253.9820f, 86.9698f, 268.1594f,
-        403.1037f, 203.0262f, 416.5545f, 349.2269f, -13.5009f, 90.1716f, 45.6503f, 121.5695f, 176.9532f, 362.8065f, 216.3486f, 456.6442f,
-        422.2061f, 217.5038f, 448.5273f, 281.0963f, 272.8624f, -12.1655f, 415.8898f, 46.0433f, 251.3114f, 271.6299f, 281.4290f, 411.3851f,
-        121.9583f, 463.6307f, 265.9058f, 486.8656f, 348.9660f, 339.7936f, 463.3310f, 489.3569f, 306.5287f, 109.8543f, 403.0297f, 167.3439f,
-        183.3392f, -22.1712f, 285.0661f, 75.4963f, 421.0473f, 397.5667f, 471.4370f, 542.7847f, 66.3152f, 463.7401f, 163.6328f, 473.3226f,
-        70.7872f, 196.9543f, 99.6043f, 335.4611f, 251.0428f, 278.3568f, 391.7609f, 363.9607f, 463.0136f, 178.3225f, 508.9808f, 284.2776f,
-        104.1169f, 198.2685f, 143.1397f, 221.4969f, 71.3536f, 19.4869f, 178.3168f, 99.9616f, 20.3440f, -2.3003f, 119.1549f, 99.0532f,
-        396.1600f, 81.8756f, 464.4035f, 150.8565f, 65.5815f, 406.2740f, 160.8160f, 430.3668f, 239.2070f, 54.2293f, 263.9715f, 91.6030f,
-        444.7733f, 49.1971f, 546.0992f, 177.5016f, -14.5900f, 271.2390f, 26.7309f, 277.3751f, 257.4168f, 54.2554f, 299.0693f, 160.8758f,
-        243.5621f, 6.6488f, 268.7269f, 156.5579f, 378.4616f, 280.6006f, 428.9858f, 282.7156f, 152.4626f, 171.5487f, 202.8190f, 196.5445f,
-        170.8344f, 262.3559f, 239.5070f, 363.8034f, 69.2827f, 451.1334f, 98.6552f, 461.0720f, 355.5286f, 31.0572f, 385.2867f, 119.9359f,
-        351.4949f, 405.2588f, 433.2140f, 508.1748f, 58.2303f, 406.9281f, 78.4330f, 495.5619f, 144.9057f, 386.8375f, 248.5514f, 442.2501f,
-        375.6284f, 263.1954f, 517.2766f, 368.0905f, -30.9426f, 265.2984f, 33.6499f, 354.8483f, 81.7472f, 303.6374f, 217.0119f, 335.5753f,
-        269.6966f, 302.7942f, 285.3457f, 387.7014f, 163.3466f, -57.9610f, 170.7473f, 74.4432f, 81.7806f, 428.8672f, 190.2646f, 529.2253f,
-        172.8226f, 257.1534f, 287.2148f, 328.4503f, 27.4537f, 366.2749f, 154.0694f, 415.1909f, 260.0797f, 181.7424f, 269.5455f, 195.5394f,
-        294.9684f, -12.5261f, 411.7275f, 24.9233f, 259.0953f, 253.5339f, 316.1996f, 256.2007f, 23.4560f, 179.5914f, 69.6533f, 327.5987f,
-        408.8140f, 201.4197f, 435.5946f, 235.5696f, 12.7857f, 108.6503f, 162.1921f, 231.0668f, 377.1631f, 111.8490f, 387.6489f, 137.9771f,
-        118.1705f, 242.1441f, 242.3947f, 285.4007f, 343.2383f, 155.9774f, 439.5230f, 219.3007f, 47.8730f, 460.2977f, 158.3999f, 509.6342f,
-        39.8081f, 26.4865f, 146.8540f, 146.4408f, 184.0596f, 87.9846f, 312.9663f, 231.6809f, 2.2755f, 81.2708f, 30.6605f, 212.6897f,
-        112.0872f, 259.7130f, 113.2101f, 283.5961f, 316.9157f, 191.2768f, 407.0965f, 308.0034f, 391.8293f, 310.3482f, 445.5542f, 333.3923f,
-        30.6705f, 406.4540f, 50.1148f, 543.5478f, 426.6715f, 103.5286f, 455.4062f, 181.6925f, 373.5433f, 320.8254f, 423.9739f, 371.9462f,
-        429.1098f, 0.3217f, 440.5745f, 24.7185f, 344.4742f, 129.8145f, 353.9543f, 132.5740f, 268.3326f, 212.8878f, 405.8205f, 250.8319f,
-        238.7950f, -53.0971f, 286.2983f, 84.0919f};
+        132.1894f, 453.1169f, 199.9736f, 545.7127f, 64.3090f,  275.1729f, 104.8258f, 338.3436f,
+        76.1273f,  401.7875f, 135.6448f, 487.9920f, 12.8305f,  442.3624f, 77.1708f,  466.2458f,
+        -5.9609f,  340.1129f, 126.0715f, 451.3386f, 15.0119f,  224.3769f, 56.2927f,  236.5545f,
+        427.8277f, -14.2917f, 561.9954f, 95.4457f,  4.7940f,   -55.8461f, 69.2637f,  71.6517f,
+        41.3494f,  202.9014f, 91.1927f,  274.2992f, 375.6902f, 208.6749f, 451.5645f, 285.6396f,
+        258.4982f, 179.9212f, 321.7420f, 227.4412f, 367.5344f, 211.3590f, 406.8828f, 356.8083f,
+        277.5064f, 220.9636f, 353.4056f, 331.1991f, 429.2783f, 390.3169f, 452.8968f, 446.2962f,
+        292.5150f, 40.8054f,  345.9525f, 67.8517f,  218.4112f, 95.7302f,  303.7139f, 129.4475f,
+        325.0759f, 361.4403f, 387.6738f, 431.5647f, 161.8149f, 353.1971f, 285.5779f, 494.6398f,
+        153.4061f, 442.2182f, 192.6577f, 552.6060f, 161.0782f, 419.9203f, 306.5742f, 452.9917f,
+        25.8953f,  380.4122f, 143.8188f, 509.4868f, 325.7002f, 128.4980f, 470.8716f, 185.8499f,
+        67.4107f,  136.8775f, 193.2931f, 264.7841f, 65.6790f,  115.5359f, 87.8525f,  152.5492f,
+        83.4548f,  256.5595f, 162.8974f, 349.7399f, 407.8717f, 399.8657f, 434.1985f, 538.9396f,
+        103.6427f, 152.6073f, 226.5586f, 192.0336f, 299.0049f, 226.3779f, 387.0450f, 330.6239f,
+        408.0779f, 74.0950f,  448.3318f, 222.2046f, -30.8828f, 73.1804f,  108.6275f, 96.6196f,
+        373.4308f, 90.5068f,  391.5936f, 104.6787f, 111.3250f, -21.7549f, 196.3405f, 79.7002f,
+        54.0937f,  448.8364f, 162.5287f, 500.4571f, 339.5665f, 195.6321f, 349.3349f, 207.2475f,
+        409.8580f, 381.1502f, 499.9386f, 452.9707f, 86.2250f,  284.0088f, 208.7943f, 397.3206f,
+        278.8861f, 74.2190f,  289.9477f, 117.7022f, 106.2550f, 62.2701f,  183.5792f, 113.1921f,
+        257.3803f, 342.4895f, 296.9053f, 469.4987f, 261.0432f, 93.1105f,  360.8189f, 171.6012f,
+        295.8262f, 393.3591f, 314.5092f, 519.9261f, 241.4629f, 36.2717f,  382.0835f, 103.7837f,
+        0.3826f,   267.3577f, 134.6972f, 410.3510f, 332.4151f, 358.2527f, 361.1253f, 456.2211f,
+        312.7919f, 108.4937f, 361.9585f, 126.7627f, 297.0153f, 71.6643f,  385.5729f, 204.5431f,
+        -16.9604f, 445.3092f, 91.0309f,  519.2097f, 189.9415f, 121.2467f, 256.8973f, 143.3509f,
+        192.3739f, 203.1031f, 216.6613f, 226.8539f, 35.0965f,  164.5365f, 51.6150f,  267.9791f,
+        36.2014f,  122.4881f, 186.1665f, 130.5466f, 186.0576f, 366.0443f, 254.9050f, 409.7468f,
+        305.9496f, 375.0105f, 436.9568f, 396.8388f, 82.0940f,  155.7987f, 154.9680f, 222.5193f,
+        345.6593f, 386.1935f, 484.0906f, 448.9323f, 265.8611f, 67.1577f,  279.9372f, 145.9173f,
+        371.2164f, -19.1800f, 389.2053f, 23.4858f,  166.5204f, 282.6964f, 306.0356f, 288.4709f,
+        178.5089f, 450.7671f, 320.6853f, 543.3107f, 285.9132f, -9.0198f,  333.8062f, 47.6641f,
+        437.0255f, 54.9746f,  490.9451f, 153.0235f, 211.6987f, 250.8616f, 280.1138f, 268.0530f,
+        232.8247f, 403.4440f, 295.8328f, 406.4968f, 286.3401f, 25.5231f,  315.6569f, 63.5189f,
+        301.3286f, 163.1046f, 436.1865f, 232.1301f, 16.5538f,  343.6795f, 55.2966f,  403.3963f,
+        204.8009f, 124.9041f, 310.8865f, 246.6391f, 235.2927f, 65.7693f,  246.2989f, 123.0671f,
+        457.4555f, 57.7300f,  464.2295f, 137.7658f, 197.5504f, 160.3075f, 295.9562f, 249.7413f,
+        208.4036f, 237.5821f, 259.9170f, 241.8350f, 431.7683f, 392.0298f, 530.4317f, 469.7846f,
+        217.7836f, 294.9363f, 232.7928f, 347.3161f, 19.1783f,  313.3156f, 161.7061f, 377.0863f,
+        52.1937f,  483.5222f, 164.7224f, 499.4650f, -18.1881f, 147.1016f, 113.3757f, 264.7419f,
+        -10.3830f, 130.9681f, 10.9511f,  272.3863f, 191.6208f, 459.5145f, 240.3248f, 463.8325f,
+        356.6797f, 77.6355f,  412.5629f, 168.2401f, 326.2139f, 307.5013f, 407.2526f, 422.3140f,
+        -6.5422f,  355.5684f, 38.6912f,  399.0047f, 279.9745f, -10.2789f, 290.0085f, 108.0669f,
+        49.1601f,  186.5052f, 105.1230f, 281.7262f, 451.0742f, 30.5586f,  490.0021f, 170.0038f,
+        54.4314f,  19.1028f,  112.9336f, 166.2725f, 298.1461f, 228.2593f, 328.4931f, 235.5688f,
+        143.1079f, 111.0670f, 183.1305f, 178.3627f, 273.5727f, 356.7796f, 367.9886f, 439.2808f,
+        176.7118f, 442.3701f, 235.5468f, 465.2348f, 353.5905f, 375.8070f, 406.0526f, 426.9136f,
+        75.0636f,  58.9357f,  155.6155f, 207.0952f, 394.8923f, 135.3580f, 510.8995f, 138.7764f,
+        221.3792f, 93.1523f,  278.8305f, 161.5760f, 333.7764f, 4.2413f,   422.3168f, 130.7968f,
+        352.3830f, 447.2686f, 497.3472f, 496.5298f, 460.0268f, 164.7789f, 538.8018f, 237.2689f,
+        43.6929f,  38.9803f,  180.2527f, 185.7092f, 83.8176f,  387.4572f, 203.0748f, 459.2138f,
+        120.3420f, 189.3440f, 130.0911f, 209.8513f, 98.9678f,  13.2052f,  163.9035f, 21.9117f,
+        238.6976f, 10.0373f,  343.7471f, 151.9043f, 422.7512f, 299.3224f, 570.7713f, 339.9280f,
+        460.4900f, 353.3999f, 529.7881f, 429.5054f, 255.9741f, 98.2099f,  270.7991f, 112.7245f,
+        277.1439f, 426.6355f, 361.8833f, 490.7601f, 420.0563f, 355.7057f, 439.9143f, 495.2914f,
+        409.9785f, 386.2606f, 522.9550f, 462.1201f, 63.6084f,  40.9810f,  140.2522f, 186.6801f,
+        209.8752f, 5.4847f,   318.6665f, 45.0513f,  351.1511f, 395.6231f, 481.6860f, 471.8004f,
+        104.2444f, 88.3651f,  198.9577f, 217.4352f, 173.7778f, 275.5634f, 266.0312f, 343.3530f,
+        436.0951f, 358.6616f, 549.5261f, 401.3052f, 429.2604f, -0.0863f,  555.7863f, 128.3795f,
+        387.8089f, 360.8724f, 518.2979f, 419.9659f, 396.0101f, 429.2169f, 402.4382f, 509.2946f,
+        92.6291f,  290.9362f, 176.5014f, 437.4388f, 143.8130f, 206.2184f, 177.0371f, 235.0044f,
+        209.0457f, 415.3847f, 338.2372f, 461.2934f, 231.5831f, 260.9141f, 329.1943f, 266.5435f,
+        220.9448f, 342.6935f, 284.5580f, 402.0774f, 303.8214f, 394.8393f, 332.8489f, 425.6666f,
+        178.4043f, 323.5138f, 229.9188f, 425.8390f, 321.6556f, 129.9190f, 427.5185f, 157.9359f,
+        151.0502f, 8.1484f,   182.4998f, 109.6955f, 157.8666f, 99.0403f,  172.8104f, 139.2982f,
+        -3.0452f,  224.4737f, 130.2711f, 278.4012f, 36.9224f,  226.1483f, 151.7898f, 279.1286f,
+        409.8757f, 237.4242f, 440.6452f, 345.2202f, 200.8640f, 162.2960f, 245.4184f, 232.8059f,
+        41.0147f,  366.0289f, 186.8531f, 420.8625f, 326.4108f, 392.5565f, 432.9303f, 520.5973f,
+        231.0067f, 80.2522f,  322.9745f, 166.4729f, -12.8403f, 351.8312f, 33.9963f,  384.6920f,
+        135.3959f, 271.4291f, 180.9655f, 406.5427f, 85.0562f,  235.5178f, 91.9452f,  287.5727f,
+        273.1645f, 90.8612f,  382.7083f, 97.6691f,  133.7990f, 360.2684f, 141.2321f, 434.9638f,
+        31.6115f,  470.5798f, 33.3353f,  490.0465f, -27.3799f, 342.6524f, 82.3149f,  379.1839f,
+        219.6726f, 402.7702f, 362.0547f, 515.0898f, -45.9977f, 481.8516f, 67.7212f,  502.3336f,
+        388.7589f, 115.4080f, 460.0333f, 236.6427f, 40.9882f,  248.8122f, 114.4089f, 389.4114f,
+        270.2910f, 191.2797f, 336.2753f, 282.6530f, 197.6581f, 439.8926f, 247.0300f, 546.7361f,
+        182.0580f, -6.7583f,  260.7935f, 100.5661f, 3.2778f,   131.7233f, 68.5193f,  280.6516f,
+        356.3126f, 411.8249f, 446.4396f, 463.7141f, 379.1163f, 129.3928f, 513.9362f, 154.6585f,
+        -69.1199f, 354.7185f, 80.1365f,  433.0744f, 82.9357f,  151.1645f, 95.6685f,  231.6187f,
+        422.7932f, 476.2348f, 481.1110f, 503.7437f, 260.7842f, 395.5883f, 288.7094f, 487.9416f,
+        48.2868f,  149.1079f, 101.7528f, 152.2125f, 79.4785f,  315.4853f, 123.3120f, 454.7079f,
+        316.4901f, 148.2175f, 343.4961f, 188.6391f, 304.9847f, 299.7342f, 419.8321f, 306.6287f,
+        262.2399f, 320.6758f, 337.1869f, 337.8050f, 407.5904f, 396.3992f, 545.5580f, 433.1963f,
+        244.1037f, -8.6806f,  249.9599f, 33.1314f,  144.6461f, 107.1346f, 155.6258f, 113.0233f,
+        208.0726f, 334.6470f, 269.1603f, 377.2708f, 173.3525f, 266.8875f, 186.3138f, 296.6358f,
+        92.1346f,  219.0953f, 132.2813f, 276.5098f, -50.9776f, -1.5900f,  96.9408f,  56.8000f,
+        160.0388f, 148.3819f, 192.1737f, 199.8940f, 340.4449f, 407.6198f, 370.9644f, 457.4804f,
+        -34.0173f, 8.2614f,   52.4551f,  22.6314f,  181.9884f, 195.8403f, 257.1901f, 200.5959f,
+        278.2621f, 457.0166f, 365.7473f, 488.1317f, 276.6353f, -31.4300f, 333.7688f, 82.3108f,
+        326.2304f, 300.5375f, 450.4180f, 449.1682f, 394.4356f, 59.1311f,  416.0841f, 198.4815f,
+        323.4377f, 395.2401f, 388.2682f, 471.3687f, -0.4884f,  332.9131f, 103.2861f, 413.1549f,
+        172.3276f, 418.9163f, 302.6948f, 466.7889f, 273.6699f, 49.8039f,  329.7361f, 166.1209f,
+        79.9860f,  208.1720f, 165.5801f, 323.1208f, 15.6250f,  326.2367f, 26.9268f,  453.0333f,
+        98.6064f,  55.6348f,  124.9839f, 190.0650f, 221.7964f, 82.5141f,  233.0980f, 148.2322f,
+        152.2380f, -44.0412f, 261.6923f, 71.2233f,  66.3730f,  418.6809f, 110.2940f, 539.8344f,
+        357.7888f, 331.5282f, 466.6268f, 378.4887f, 457.3967f, 248.0516f, 468.2900f, 387.5087f,
+        35.9143f,  364.4689f, 165.4340f, 379.5258f, 402.0395f, 191.2334f, 527.5334f, 340.3795f,
+        1.8053f,   180.1951f, 16.0557f,  295.9387f, 460.2114f, 217.3174f, 464.7511f, 232.2148f,
+        471.2709f, 270.8305f, 480.6579f, 369.6087f, -58.0695f, 97.7211f,  70.1214f,  103.8139f,
+        363.5242f, 386.1504f, 399.4951f, 501.9083f, 443.7544f, 345.8341f, 526.4471f, 465.9183f,
+        420.6959f, 129.4022f, 485.2063f, 220.1614f, 425.5884f, 224.9686f, 545.1217f, 353.6407f,
+        238.2388f, 62.7213f,  312.0847f, 78.3060f,  1.2788f,   465.1168f, 76.8773f,  507.2295f,
+        350.7072f, 420.0901f, 499.0819f, 482.8026f, 295.2295f, 457.2856f, 318.5988f, 464.6119f,
+        248.9387f, 366.2193f, 368.7308f, 464.4846f, 266.4057f, -43.0988f, 411.9049f, 94.8485f,
+        365.3591f, 230.8355f, 381.3726f, 246.8133f, 213.6699f, 419.1429f, 302.9046f, 467.1919f,
+        282.3146f, 326.7091f, 321.6300f, 338.5049f, 157.0835f, 271.7193f, 238.9818f, 413.4953f,
+        -3.7474f,  97.9864f,  45.0004f,  165.3309f, 28.3577f,  158.4742f, 71.5941f,  260.1006f,
+        284.2465f, 120.1271f, 370.7495f, 246.4540f, 483.6205f, 186.3921f, 511.9348f, 335.0511f,
+        -27.5488f, 218.5612f, 43.3521f,  243.6668f, 229.8062f, 103.3855f, 327.7773f, 223.5129f,
+        365.4548f, 86.1273f,  385.5540f, 219.3533f, 343.5581f, 121.2852f, 483.2167f, 129.5677f,
+        234.4260f, 125.8439f, 310.7789f, 239.2034f, 248.4032f, 48.0437f,  371.5128f, 101.8978f,
+        299.1465f, 387.2317f, 397.5784f, 484.8726f, 376.0880f, 262.2631f, 482.8782f, 339.8563f,
+        7.2930f,   47.0424f,  114.9965f, 86.7440f,  397.3961f, 336.3557f, 528.7860f, 357.5037f,
+        -33.2049f, 414.6207f, 59.2223f,  433.0458f, 396.8727f, 110.5703f, 439.3271f, 126.9654f,
+        30.4567f,  27.2849f,  46.2837f,  123.3157f, 51.6484f,  -22.3715f, 142.9798f, 30.9887f,
+        -3.4962f,  6.9860f,   7.3904f,   40.2644f,  204.1520f, 329.0802f, 241.1047f, 433.1711f,
+        162.1569f, 441.9229f, 172.2023f, 545.2635f, 41.6043f,  -18.2279f, 124.3886f, 63.1082f,
+        213.0999f, 303.8811f, 237.9903f, 444.1898f, 155.2101f, 6.7177f,   247.1608f, 65.1444f,
+        324.4111f, 233.2946f, 443.2500f, 358.8382f, 384.8351f, 371.9398f, 508.2953f, 384.1355f,
+        302.7226f, 123.9848f, 349.8446f, 235.2196f, 20.8081f,  -68.6720f, 103.6023f, 79.6067f,
+        105.2511f, 234.0231f, 190.1397f, 361.1662f, 420.9290f, 451.9373f, 492.3893f, 539.3073f,
+        -4.9387f,  81.6146f,  93.6732f,  176.0028f, 187.2764f, 67.9256f,  219.5794f, 121.5657f,
+        397.7987f, 10.8413f,  544.7059f, 113.0846f, 467.5255f, 219.7334f, 483.1394f, 335.5223f,
+        143.3246f, 223.3545f, 267.8786f, 373.0906f, 288.9383f, 358.9469f, 378.4586f, 433.3239f,
+        209.6311f, 371.4695f, 247.1145f, 381.6038f, 320.6775f, 401.3793f, 432.7831f, 491.1622f,
+        8.9968f,   393.5190f, 22.5845f,  412.2537f, 13.8844f,  104.8985f, 130.2727f, 142.3685f,
+        262.6455f, 252.9446f, 351.5533f, 302.9328f, 107.5252f, 93.7443f,  125.0270f, 203.6677f,
+        326.6030f, 150.6990f, 339.4493f, 179.0864f, 119.1742f, 453.1236f, 232.0488f, 478.8208f,
+        420.9991f, 337.0981f, 465.6465f, 344.7978f, 342.8767f, 421.7388f, 476.3827f, 552.8516f,
+        189.1445f, 156.2901f, 303.6933f, 260.6224f, 333.9324f, 265.2428f, 438.9627f, 272.1948f,
+        114.3128f, 240.9499f, 156.8251f, 246.1655f, 193.8135f, 11.5223f,  300.4463f, 95.7648f,
+        27.6040f,  96.8022f,  169.8780f, 139.8998f, 423.1219f, 218.8621f, 437.7643f, 308.7743f,
+        386.7347f, 0.8091f,   436.3329f, 66.5652f,  433.0917f, 396.4442f, 469.0579f, 535.0178f,
+        408.9413f, 39.9801f,  468.5356f, 83.8636f,  423.9944f, 47.8940f,  535.6019f, 150.0867f,
+        78.3370f,  378.1336f, 149.9992f, 387.1877f, 422.8927f, -23.2443f, 508.9316f, 120.1789f,
+        261.7021f, 376.5726f, 309.5111f, 523.7055f, 200.2215f, 307.2894f, 222.2736f, 418.4116f,
+        259.8004f, -0.8479f,  300.5735f, 69.4688f,  106.7550f, 329.0340f, 235.8474f, 362.8130f,
+        98.8964f,  254.7818f, 189.6566f, 376.8467f, 91.9970f,  323.3163f, 149.3173f, 434.0331f,
+        -18.1340f, 397.0634f, 100.5620f, 431.1345f, 242.9804f, 325.0598f, 253.5845f, 393.2908f,
+        424.4659f, 258.1096f, 463.2957f, 328.0667f, 297.4333f, 99.1641f,  332.7187f, 223.2992f,
+        186.5782f, 297.1904f, 334.3975f, 400.0833f, 161.1921f, 430.0698f, 267.4008f, 526.9018f,
+        185.6758f, 244.8488f, 278.7259f, 342.6730f, 103.7673f, 311.5224f, 105.5101f, 352.8224f,
+        397.2368f, 190.3715f, 425.6990f, 246.7565f, 51.3437f,  374.1586f, 147.0393f, 381.9622f,
+        329.5223f, 439.7066f, 387.1005f, 557.9608f, 310.6336f, 47.4363f,  449.3514f, 112.9530f,
+        229.9626f, 68.0539f,  344.9065f, 134.3514f, 397.6331f, 250.9398f, 465.2933f, 288.4979f,
+        89.1863f,  224.5854f, 201.8640f, 256.7900f, 367.6410f, 241.4922f, 513.9763f, 330.0776f,
+        329.8622f, 6.7118f,   399.5483f, 42.3622f,  351.0067f, 196.8547f, 447.7431f, 207.4218f,
+        263.3493f, 233.8098f, 401.2304f, 349.1684f, 404.1452f, 264.0487f, 442.1978f, 321.1426f,
+        430.0009f, 299.8394f, 563.0980f, 357.4945f, 202.3143f, 327.4748f, 217.8485f, 392.7412f,
+        358.1485f, 259.5528f, 455.7672f, 381.9944f, 313.4684f, 370.7192f, 431.1113f, 419.5239f,
+        180.1469f, 255.4066f, 272.7232f, 369.3540f, 426.0572f, 198.2577f, 500.8918f, 339.2499f,
+        150.7206f, 253.3635f, 243.7053f, 352.8329f, 270.9340f, 17.9364f,  294.5319f, 83.2569f,
+        36.4112f,  80.3679f,  69.5312f,  192.7886f, 92.2801f,  229.0865f, 133.4951f, 298.3132f,
+        375.3135f, 405.1188f, 465.3827f, 467.8684f, 164.8547f, 299.8922f, 231.6980f, 379.1594f,
+        178.3286f, 21.0337f,  215.7555f, 69.3744f,  56.7212f,  287.8708f, 189.2598f, 304.4041f,
+        217.4480f, 79.4625f,  274.1624f, 142.2755f, 369.1791f, 357.2809f, 436.6378f, 376.7356f,
+        416.5593f, 382.6425f, 478.6048f, 444.7983f, 21.0025f,  254.7366f, 49.1120f,  338.7197f,
+        232.4042f, 225.8433f, 342.4166f, 365.5193f, 199.7265f, 166.0972f, 267.5468f, 172.4943f,
+        305.4298f, 176.3264f, 308.8521f, 269.9237f, 151.3188f, 397.4529f, 295.9569f, 466.6555f,
+        138.0480f, 359.6507f, 260.5968f, 363.6696f, 181.5352f, 240.7855f, 290.3455f, 278.9682f,
+        225.7522f, 174.7890f, 356.2469f, 193.4433f, 182.4345f, 8.5387f,   318.5487f, 41.8410f,
+        210.4292f, 50.5482f,  261.7152f, 92.4592f,  362.9012f, 66.1153f,  454.9341f, 126.9099f,
+        326.9678f, 146.7783f, 418.6802f, 226.6052f, 150.2754f, 471.4981f, 191.1031f, 472.6456f,
+        383.2531f, 240.0174f, 417.3240f, 265.1360f, 417.8392f, 109.9494f, 435.8114f, 124.8908f,
+        27.1272f,  11.4244f,  126.3650f, 94.3257f,  232.6628f, 144.1367f, 350.0197f, 194.1688f,
+        85.4650f,  366.5097f, 199.8470f, 449.2209f, 345.5237f, 174.6456f, 393.6487f, 208.6972f,
+        103.6008f, 383.9478f, 135.1845f, 388.5580f, 301.4075f, 330.7206f, 369.9960f, 471.9843f,
+        86.3247f,  46.8414f,  168.7999f, 63.9793f,  186.5999f, 294.3789f, 324.5439f, 314.2809f,
+        408.6489f, 468.1303f, 539.9976f, 490.9658f, 121.9074f, 127.4639f, 259.4001f, 274.6741f,
+        374.0247f, -21.0436f, 501.7138f, 71.9877f,  421.1110f, 415.6848f, 565.8336f, 507.6180f,
+        402.2457f, 367.8241f, 472.6052f, 515.8422f, 78.8962f,  253.9820f, 86.9698f,  268.1594f,
+        403.1037f, 203.0262f, 416.5545f, 349.2269f, -13.5009f, 90.1716f,  45.6503f,  121.5695f,
+        176.9532f, 362.8065f, 216.3486f, 456.6442f, 422.2061f, 217.5038f, 448.5273f, 281.0963f,
+        272.8624f, -12.1655f, 415.8898f, 46.0433f,  251.3114f, 271.6299f, 281.4290f, 411.3851f,
+        121.9583f, 463.6307f, 265.9058f, 486.8656f, 348.9660f, 339.7936f, 463.3310f, 489.3569f,
+        306.5287f, 109.8543f, 403.0297f, 167.3439f, 183.3392f, -22.1712f, 285.0661f, 75.4963f,
+        421.0473f, 397.5667f, 471.4370f, 542.7847f, 66.3152f,  463.7401f, 163.6328f, 473.3226f,
+        70.7872f,  196.9543f, 99.6043f,  335.4611f, 251.0428f, 278.3568f, 391.7609f, 363.9607f,
+        463.0136f, 178.3225f, 508.9808f, 284.2776f, 104.1169f, 198.2685f, 143.1397f, 221.4969f,
+        71.3536f,  19.4869f,  178.3168f, 99.9616f,  20.3440f,  -2.3003f,  119.1549f, 99.0532f,
+        396.1600f, 81.8756f,  464.4035f, 150.8565f, 65.5815f,  406.2740f, 160.8160f, 430.3668f,
+        239.2070f, 54.2293f,  263.9715f, 91.6030f,  444.7733f, 49.1971f,  546.0992f, 177.5016f,
+        -14.5900f, 271.2390f, 26.7309f,  277.3751f, 257.4168f, 54.2554f,  299.0693f, 160.8758f,
+        243.5621f, 6.6488f,   268.7269f, 156.5579f, 378.4616f, 280.6006f, 428.9858f, 282.7156f,
+        152.4626f, 171.5487f, 202.8190f, 196.5445f, 170.8344f, 262.3559f, 239.5070f, 363.8034f,
+        69.2827f,  451.1334f, 98.6552f,  461.0720f, 355.5286f, 31.0572f,  385.2867f, 119.9359f,
+        351.4949f, 405.2588f, 433.2140f, 508.1748f, 58.2303f,  406.9281f, 78.4330f,  495.5619f,
+        144.9057f, 386.8375f, 248.5514f, 442.2501f, 375.6284f, 263.1954f, 517.2766f, 368.0905f,
+        -30.9426f, 265.2984f, 33.6499f,  354.8483f, 81.7472f,  303.6374f, 217.0119f, 335.5753f,
+        269.6966f, 302.7942f, 285.3457f, 387.7014f, 163.3466f, -57.9610f, 170.7473f, 74.4432f,
+        81.7806f,  428.8672f, 190.2646f, 529.2253f, 172.8226f, 257.1534f, 287.2148f, 328.4503f,
+        27.4537f,  366.2749f, 154.0694f, 415.1909f, 260.0797f, 181.7424f, 269.5455f, 195.5394f,
+        294.9684f, -12.5261f, 411.7275f, 24.9233f,  259.0953f, 253.5339f, 316.1996f, 256.2007f,
+        23.4560f,  179.5914f, 69.6533f,  327.5987f, 408.8140f, 201.4197f, 435.5946f, 235.5696f,
+        12.7857f,  108.6503f, 162.1921f, 231.0668f, 377.1631f, 111.8490f, 387.6489f, 137.9771f,
+        118.1705f, 242.1441f, 242.3947f, 285.4007f, 343.2383f, 155.9774f, 439.5230f, 219.3007f,
+        47.8730f,  460.2977f, 158.3999f, 509.6342f, 39.8081f,  26.4865f,  146.8540f, 146.4408f,
+        184.0596f, 87.9846f,  312.9663f, 231.6809f, 2.2755f,   81.2708f,  30.6605f,  212.6897f,
+        112.0872f, 259.7130f, 113.2101f, 283.5961f, 316.9157f, 191.2768f, 407.0965f, 308.0034f,
+        391.8293f, 310.3482f, 445.5542f, 333.3923f, 30.6705f,  406.4540f, 50.1148f,  543.5478f,
+        426.6715f, 103.5286f, 455.4062f, 181.6925f, 373.5433f, 320.8254f, 423.9739f, 371.9462f,
+        429.1098f, 0.3217f,   440.5745f, 24.7185f,  344.4742f, 129.8145f, 353.9543f, 132.5740f,
+        268.3326f, 212.8878f, 405.8205f, 250.8319f, 238.7950f, -53.0971f, 286.2983f, 84.0919f};
     std::vector<float> scores_vec = {
-        0.9822f, 0.9644f, 0.1426f, 0.7149f, 0.6008f, 0.6906f, 0.0962f, 0.1886f, 0.0766f, 0.6041f, 0.9866f, 0.6720f,
-        0.7108f, 0.9846f, 0.6780f, 0.0402f, 0.8670f, 0.3647f, 0.0044f, 0.5072f, 0.9370f, 0.2573f, 0.4915f, 0.1738f,
-        0.0577f, 0.0805f, 0.7270f, 0.8641f, 0.1433f, 0.2883f, 0.1950f, 0.0269f, 0.5534f, 0.6999f, 0.6479f, 0.3881f,
-        0.5550f, 0.0941f, 0.1543f, 0.9318f, 0.7615f, 0.9227f, 0.9167f, 0.6494f, 0.9282f, 0.4167f, 0.0036f, 0.0626f,
-        0.1095f, 0.0954f, 0.3517f, 0.7013f, 0.7906f, 0.5902f, 0.1464f, 0.7479f, 0.3548f, 0.0130f, 0.2806f, 0.3306f,
-        0.2742f, 0.8119f, 0.7599f, 0.6956f, 0.1390f, 0.8078f, 0.6772f, 0.1948f, 0.6481f, 0.4835f, 0.4394f, 0.1121f,
-        0.5183f, 0.0999f, 0.1643f, 0.1325f, 0.9541f, 0.2849f, 0.3552f, 0.3221f, 0.8983f, 0.5630f, 0.9192f, 0.2999f,
-        0.1148f, 0.5562f, 0.3455f, 0.8019f, 0.8794f, 0.4726f, 0.9714f, 0.5530f, 0.2709f, 0.4890f, 0.0373f, 0.8040f,
-        0.1014f, 0.3087f, 0.5653f, 0.0430f, 0.0793f, 0.6961f, 0.0718f, 0.4771f, 0.3387f, 0.2281f, 0.1888f, 0.7634f,
-        0.9515f, 0.1402f, 0.9597f, 0.5948f, 0.6417f, 0.7099f, 0.7041f, 0.8198f, 0.4835f, 0.5334f, 0.3238f, 0.1053f,
-        0.6646f, 0.0336f, 0.2756f, 0.0942f, 0.1907f, 0.6387f, 0.6285f, 0.4211f, 0.0902f, 0.4334f, 0.3527f, 0.7205f,
-        0.5790f, 0.4916f, 0.4870f, 0.9663f, 0.7563f, 0.4970f, 0.4792f, 0.0265f, 0.9425f, 0.3192f, 0.2559f, 0.9994f,
-        0.7187f, 0.0474f, 0.0619f, 0.0255f, 0.5996f, 0.0716f, 0.9334f, 0.9369f, 0.5461f, 0.6166f, 0.2919f, 0.0640f,
-        0.7375f, 0.1018f, 0.0856f, 0.3112f, 0.0125f, 0.4340f, 0.7077f, 0.8013f, 0.6043f, 0.8469f, 0.4065f, 0.8488f,
-        0.5065f, 0.2230f, 0.9441f, 0.2750f, 0.0262f, 0.2427f, 0.3667f, 0.3513f, 0.5247f, 0.8831f, 0.2923f, 0.5208f,
-        0.3401f, 0.8218f, 0.1576f, 0.1035f, 0.5030f, 0.6719f, 0.7955f, 0.5896f, 0.7738f, 0.3927f, 0.0329f, 0.1161f,
-        0.0387f, 0.3289f, 0.4955f, 0.3563f, 0.5606f, 0.4806f, 0.6779f, 0.6670f, 0.3181f, 0.3462f, 0.5851f, 0.5964f,
-        0.3147f, 0.3303f, 0.6940f, 0.6474f, 0.1351f, 0.4410f, 0.8927f, 0.0363f, 0.8552f, 0.1632f, 0.5072f, 0.4243f,
-        0.0101f, 0.9154f, 0.4549f, 0.9543f, 0.2867f, 0.8663f, 0.9224f, 0.5568f, 0.2027f, 0.6852f, 0.5490f, 0.9445f,
-        0.4393f, 0.2685f, 0.1383f, 0.6986f, 0.9741f, 0.0283f, 0.7404f, 0.9269f, 0.0748f, 0.1102f, 0.6920f, 0.6480f,
-        0.0688f, 0.8344f, 0.5234f, 0.9072f, 0.8780f, 0.8125f, 0.5159f, 0.2517f, 0.5060f, 0.1008f, 0.6588f, 0.1340f,
-        0.5112f, 0.0544f, 0.2995f, 0.2321f, 0.6200f, 0.7868f, 0.0573f, 0.8503f, 0.8608f, 0.3423f, 0.6590f, 0.4026f,
-        0.1542f, 0.5287f, 0.0864f, 0.8785f, 0.9243f, 0.8216f, 0.5625f, 0.5576f, 0.9846f, 0.2479f, 0.0759f, 0.5619f,
-        0.3288f, 0.3223f, 0.0071f, 0.5962f, 0.2640f, 0.1879f, 0.0404f, 0.3644f, 0.8790f, 0.3367f, 0.6791f, 0.7565f,
-        0.3281f, 0.8216f, 0.6919f, 0.5592f, 0.0010f, 0.0351f, 0.9909f, 0.7823f, 0.9376f, 0.9023f, 0.0204f, 0.7918f,
-        0.4511f, 0.7896f, 0.0067f, 0.2882f, 0.7513f, 0.7930f, 0.6197f, 0.3013f, 0.3104f, 0.9668f, 0.4392f, 0.4471f,
-        0.5523f, 0.4095f, 0.5527f, 0.4323f, 0.8267f, 0.9091f, 0.9321f, 0.5643f, 0.4421f, 0.7052f, 0.8383f, 0.5630f,
-        0.7000f, 0.7497f, 0.6764f, 0.7461f, 0.2086f, 0.4984f, 0.5883f, 0.0025f, 0.8560f, 0.6100f, 0.1291f, 0.8164f,
-        0.7171f, 0.7583f, 0.3920f, 0.8542f, 0.4140f, 0.5705f, 0.0006f, 0.6449f, 0.7182f, 0.5671f, 0.4966f, 0.8099f,
-        0.6814f, 0.2781f, 0.9591f, 0.7073f, 0.9879f, 0.9713f, 0.9189f, 0.7554f, 0.6094f, 0.1722f, 0.5434f, 0.7654f,
-        0.5209f, 0.8682f, 0.1097f, 0.3809f, 0.5060f, 0.4323f, 0.1086f, 0.1535f, 0.8376f, 0.4844f, 0.0487f, 0.0165f,
-        0.4735f, 0.1644f, 0.7051f, 0.7953f, 0.2283f, 0.5922f, 0.1544f, 0.3036f, 0.8888f, 0.5441f, 0.8859f, 0.2252f,
-        0.3300f, 0.4710f, 0.4801f, 0.9976f, 0.1144f, 0.8520f, 0.8637f, 0.5532f, 0.3440f, 0.5192f, 0.2925f, 0.7991f,
-        0.4983f, 0.9258f, 0.6227f, 0.5143f, 0.7111f, 0.5039f, 0.9045f, 0.1844f, 0.9733f, 0.8122f, 0.8607f, 0.4829f,
-        0.8372f, 0.3068f, 0.7619f, 0.1405f, 0.3071f, 0.4457f, 0.3223f, 0.3870f, 0.8201f, 0.2567f, 0.7453f, 0.0737f,
-        0.7657f, 0.7920f, 0.4017f, 0.7225f, 0.9151f, 0.8007f, 0.3904f, 0.4842f, 0.7794f, 0.2926f, 0.8039f, 0.3281f,
-        0.8060f, 0.0868f, 0.0444f, 0.9977f, 0.8695f, 0.8828f, 0.9513f, 0.4383f, 0.2868f, 0.1300f, 0.5012f, 0.2200f,
-        0.9356f, 0.0040f, 0.1432f, 0.2465f, 0.1990f, 0.2258f, 0.6560f, 0.3275f, 0.6150f, 0.8903f, 0.6026f, 0.6945f,
-        0.3655f, 0.1597f, 0.3206f, 0.9643f, 0.6218f, 0.2775f, 0.4509f, 0.8355f, 0.6684f, 0.5607f, 0.8852f, 0.6724f,
-        0.6427f, 0.1898f, 0.1064f, 0.9651f, 0.5989f, 0.4157f, 0.5890f, 0.0618f, 0.8221f, 0.2166f, 0.8045f, 0.5344f,
-        0.2766f, 0.0302f, 0.8158f, 0.1765f, 0.0518f, 0.7559f, 0.3500f, 0.3893f, 0.2471f, 0.8592f, 0.2973f, 0.2102f,
-        0.3092f, 0.2031f, 0.3177f, 0.0829f, 0.1585f, 0.4171f, 0.8795f, 0.0573f, 0.2127f, 0.9083f, 0.8900f, 0.6795f,
-        0.2405f, 0.4198f, 0.2112f, 0.1286f, 0.3800f, 0.5758f, 0.3599f, 0.6108f, 0.2963f, 0.3459f, 0.7907f, 0.8783f,
-        0.3220f, 0.5715f, 0.2782f, 0.0533f, 0.7379f, 0.1710f, 0.4257f, 0.4870f, 0.1845f, 0.0946f, 0.3480f, 0.9523f,
-        0.6151f, 0.3814f, 0.0389f, 0.6003f, 0.0923f, 0.5425f, 0.7520f, 0.4236f, 0.2994f, 0.0474f, 0.0248f, 0.4300f,
-        0.8833f, 0.2441f, 0.5741f, 0.6843f, 0.0608f, 0.1531f, 0.3313f, 0.6701f, 0.4390f, 0.7342f, 0.8676f, 0.7584f,
-        0.9922f, 0.7544f, 0.8522f, 0.8324f, 0.7303f, 0.8018f, 0.9347f, 0.4752f, 0.6383f, 0.5149f, 0.8510f, 0.4314f,
-        0.8197f, 0.7994f, 0.9619f, 0.2489f, 0.7096f, 0.7569f, 0.9363f, 0.9069f, 0.5735f, 0.5792f, 0.1673f, 0.9750f,
-        0.2550f, 0.7247f, 0.7958f, 0.4412f, 0.2112f, 0.1890f, 0.8565f, 0.5108f, 0.0901f, 0.7170f, 0.2502f, 0.8764f,
-        0.3096f, 0.2003f, 0.0849f, 0.5115f, 0.4507f, 0.7513f, 0.4646f, 0.3438f, 0.2617f, 0.2781f, 0.9278f, 0.1651f,
-        0.9882f, 0.3269f, 0.0884f, 0.2487f, 0.0584f, 0.7900f, 0.5126f, 0.3370f, 0.6620f, 0.6306f, 0.9399f, 0.9613f,
-        0.6807f, 0.8178f, 0.7924f, 0.4913f, 0.7045f, 0.0783f, 0.7580f, 0.9618f, 0.0850f, 0.8361f, 0.9330f, 0.2262f,
-        0.5248f, 0.9279f, 0.9602f, 0.1279f, 0.3490f, 0.6981f, 0.2216f, 0.3248f, 0.0233f, 0.1535f, 0.5623f, 0.6531f,
-        0.6489f, 0.7784f, 0.4153f, 0.2735f, 0.0156f, 0.2066f, 0.3124f, 0.1782f, 0.0201f, 0.1574f, 0.6661f, 0.6296f,
-        0.9357f, 0.7982f, 0.5678f, 0.1376f, 0.5641f, 0.0616f, 0.4309f, 0.3903f, 0.4278f, 0.2798f, 0.6858f, 0.8409f,
-        0.7685f, 0.6278f, 0.5383f, 0.0311f, 0.7229f, 0.5450f, 0.2707f, 0.3278f, 0.9356f, 0.6244f, 0.4759f, 0.6209f,
-        0.4137f, 0.4702f, 0.2903f, 0.4399f, 0.6856f, 0.0399f, 0.7950f, 0.2830f, 0.6826f, 0.6427f, 0.6526f, 0.6081f,
-        0.9591f, 0.5083f, 0.7323f, 0.7054f, 0.2363f, 0.2833f, 0.4240f, 0.2777f, 0.3667f, 0.3910f, 0.6039f, 0.2199f,
-        0.8043f, 0.4375f, 0.7062f, 0.0814f, 0.4700f, 0.0282f, 0.6759f, 0.3437f, 0.9493f, 0.3241f, 0.5638f, 0.2574f,
-        0.6201f, 0.4670f, 0.3706f, 0.2037f, 0.1115f, 0.1199f, 0.9990f, 0.4123f, 0.0019f, 0.9529f, 0.0200f, 0.4186f,
-        0.7175f, 0.9146f, 0.7129f, 0.4636f, 0.9744f, 0.0393f, 0.9869f, 0.8494f, 0.9289f, 0.2548f, 0.1425f, 0.6633f,
-        0.5159f, 0.5232f, 0.9246f, 0.6201f, 0.3111f, 0.4001f, 0.1335f, 0.1923f, 0.1434f, 0.8103f, 0.7049f, 0.5303f,
-        0.3744f, 0.6685f, 0.8129f, 0.8812f, 0.5470f, 0.8199f, 0.5113f, 0.4745f, 0.8654f, 0.3864f, 0.3959f, 0.3049f,
-        0.5187f, 0.5449f, 0.6605f, 0.4305f, 0.2178f, 0.8668f, 0.3460f, 0.9229f, 0.2074f, 0.5601f, 0.5366f, 0.8286f,
-        0.1389f, 0.9099f, 0.5314f, 0.5861f, 0.5102f, 0.0360f, 0.4971f, 0.2635f, 0.3427f, 0.6491f, 0.4977f, 0.0932f,
-        0.0730f, 0.1857f, 0.1909f, 0.6083f, 0.1778f, 0.8817f, 0.2098f, 0.0911f, 0.8757f, 0.2953f, 0.4254f, 0.9590f,
-        0.9444f, 0.7149f, 0.0689f, 0.5933f, 0.9891f, 0.9469f, 0.1060f, 0.3960f};
+        0.9822f, 0.9644f, 0.1426f, 0.7149f, 0.6008f, 0.6906f, 0.0962f, 0.1886f, 0.0766f, 0.6041f,
+        0.9866f, 0.6720f, 0.7108f, 0.9846f, 0.6780f, 0.0402f, 0.8670f, 0.3647f, 0.0044f, 0.5072f,
+        0.9370f, 0.2573f, 0.4915f, 0.1738f, 0.0577f, 0.0805f, 0.7270f, 0.8641f, 0.1433f, 0.2883f,
+        0.1950f, 0.0269f, 0.5534f, 0.6999f, 0.6479f, 0.3881f, 0.5550f, 0.0941f, 0.1543f, 0.9318f,
+        0.7615f, 0.9227f, 0.9167f, 0.6494f, 0.9282f, 0.4167f, 0.0036f, 0.0626f, 0.1095f, 0.0954f,
+        0.3517f, 0.7013f, 0.7906f, 0.5902f, 0.1464f, 0.7479f, 0.3548f, 0.0130f, 0.2806f, 0.3306f,
+        0.2742f, 0.8119f, 0.7599f, 0.6956f, 0.1390f, 0.8078f, 0.6772f, 0.1948f, 0.6481f, 0.4835f,
+        0.4394f, 0.1121f, 0.5183f, 0.0999f, 0.1643f, 0.1325f, 0.9541f, 0.2849f, 0.3552f, 0.3221f,
+        0.8983f, 0.5630f, 0.9192f, 0.2999f, 0.1148f, 0.5562f, 0.3455f, 0.8019f, 0.8794f, 0.4726f,
+        0.9714f, 0.5530f, 0.2709f, 0.4890f, 0.0373f, 0.8040f, 0.1014f, 0.3087f, 0.5653f, 0.0430f,
+        0.0793f, 0.6961f, 0.0718f, 0.4771f, 0.3387f, 0.2281f, 0.1888f, 0.7634f, 0.9515f, 0.1402f,
+        0.9597f, 0.5948f, 0.6417f, 0.7099f, 0.7041f, 0.8198f, 0.4835f, 0.5334f, 0.3238f, 0.1053f,
+        0.6646f, 0.0336f, 0.2756f, 0.0942f, 0.1907f, 0.6387f, 0.6285f, 0.4211f, 0.0902f, 0.4334f,
+        0.3527f, 0.7205f, 0.5790f, 0.4916f, 0.4870f, 0.9663f, 0.7563f, 0.4970f, 0.4792f, 0.0265f,
+        0.9425f, 0.3192f, 0.2559f, 0.9994f, 0.7187f, 0.0474f, 0.0619f, 0.0255f, 0.5996f, 0.0716f,
+        0.9334f, 0.9369f, 0.5461f, 0.6166f, 0.2919f, 0.0640f, 0.7375f, 0.1018f, 0.0856f, 0.3112f,
+        0.0125f, 0.4340f, 0.7077f, 0.8013f, 0.6043f, 0.8469f, 0.4065f, 0.8488f, 0.5065f, 0.2230f,
+        0.9441f, 0.2750f, 0.0262f, 0.2427f, 0.3667f, 0.3513f, 0.5247f, 0.8831f, 0.2923f, 0.5208f,
+        0.3401f, 0.8218f, 0.1576f, 0.1035f, 0.5030f, 0.6719f, 0.7955f, 0.5896f, 0.7738f, 0.3927f,
+        0.0329f, 0.1161f, 0.0387f, 0.3289f, 0.4955f, 0.3563f, 0.5606f, 0.4806f, 0.6779f, 0.6670f,
+        0.3181f, 0.3462f, 0.5851f, 0.5964f, 0.3147f, 0.3303f, 0.6940f, 0.6474f, 0.1351f, 0.4410f,
+        0.8927f, 0.0363f, 0.8552f, 0.1632f, 0.5072f, 0.4243f, 0.0101f, 0.9154f, 0.4549f, 0.9543f,
+        0.2867f, 0.8663f, 0.9224f, 0.5568f, 0.2027f, 0.6852f, 0.5490f, 0.9445f, 0.4393f, 0.2685f,
+        0.1383f, 0.6986f, 0.9741f, 0.0283f, 0.7404f, 0.9269f, 0.0748f, 0.1102f, 0.6920f, 0.6480f,
+        0.0688f, 0.8344f, 0.5234f, 0.9072f, 0.8780f, 0.8125f, 0.5159f, 0.2517f, 0.5060f, 0.1008f,
+        0.6588f, 0.1340f, 0.5112f, 0.0544f, 0.2995f, 0.2321f, 0.6200f, 0.7868f, 0.0573f, 0.8503f,
+        0.8608f, 0.3423f, 0.6590f, 0.4026f, 0.1542f, 0.5287f, 0.0864f, 0.8785f, 0.9243f, 0.8216f,
+        0.5625f, 0.5576f, 0.9846f, 0.2479f, 0.0759f, 0.5619f, 0.3288f, 0.3223f, 0.0071f, 0.5962f,
+        0.2640f, 0.1879f, 0.0404f, 0.3644f, 0.8790f, 0.3367f, 0.6791f, 0.7565f, 0.3281f, 0.8216f,
+        0.6919f, 0.5592f, 0.0010f, 0.0351f, 0.9909f, 0.7823f, 0.9376f, 0.9023f, 0.0204f, 0.7918f,
+        0.4511f, 0.7896f, 0.0067f, 0.2882f, 0.7513f, 0.7930f, 0.6197f, 0.3013f, 0.3104f, 0.9668f,
+        0.4392f, 0.4471f, 0.5523f, 0.4095f, 0.5527f, 0.4323f, 0.8267f, 0.9091f, 0.9321f, 0.5643f,
+        0.4421f, 0.7052f, 0.8383f, 0.5630f, 0.7000f, 0.7497f, 0.6764f, 0.7461f, 0.2086f, 0.4984f,
+        0.5883f, 0.0025f, 0.8560f, 0.6100f, 0.1291f, 0.8164f, 0.7171f, 0.7583f, 0.3920f, 0.8542f,
+        0.4140f, 0.5705f, 0.0006f, 0.6449f, 0.7182f, 0.5671f, 0.4966f, 0.8099f, 0.6814f, 0.2781f,
+        0.9591f, 0.7073f, 0.9879f, 0.9713f, 0.9189f, 0.7554f, 0.6094f, 0.1722f, 0.5434f, 0.7654f,
+        0.5209f, 0.8682f, 0.1097f, 0.3809f, 0.5060f, 0.4323f, 0.1086f, 0.1535f, 0.8376f, 0.4844f,
+        0.0487f, 0.0165f, 0.4735f, 0.1644f, 0.7051f, 0.7953f, 0.2283f, 0.5922f, 0.1544f, 0.3036f,
+        0.8888f, 0.5441f, 0.8859f, 0.2252f, 0.3300f, 0.4710f, 0.4801f, 0.9976f, 0.1144f, 0.8520f,
+        0.8637f, 0.5532f, 0.3440f, 0.5192f, 0.2925f, 0.7991f, 0.4983f, 0.9258f, 0.6227f, 0.5143f,
+        0.7111f, 0.5039f, 0.9045f, 0.1844f, 0.9733f, 0.8122f, 0.8607f, 0.4829f, 0.8372f, 0.3068f,
+        0.7619f, 0.1405f, 0.3071f, 0.4457f, 0.3223f, 0.3870f, 0.8201f, 0.2567f, 0.7453f, 0.0737f,
+        0.7657f, 0.7920f, 0.4017f, 0.7225f, 0.9151f, 0.8007f, 0.3904f, 0.4842f, 0.7794f, 0.2926f,
+        0.8039f, 0.3281f, 0.8060f, 0.0868f, 0.0444f, 0.9977f, 0.8695f, 0.8828f, 0.9513f, 0.4383f,
+        0.2868f, 0.1300f, 0.5012f, 0.2200f, 0.9356f, 0.0040f, 0.1432f, 0.2465f, 0.1990f, 0.2258f,
+        0.6560f, 0.3275f, 0.6150f, 0.8903f, 0.6026f, 0.6945f, 0.3655f, 0.1597f, 0.3206f, 0.9643f,
+        0.6218f, 0.2775f, 0.4509f, 0.8355f, 0.6684f, 0.5607f, 0.8852f, 0.6724f, 0.6427f, 0.1898f,
+        0.1064f, 0.9651f, 0.5989f, 0.4157f, 0.5890f, 0.0618f, 0.8221f, 0.2166f, 0.8045f, 0.5344f,
+        0.2766f, 0.0302f, 0.8158f, 0.1765f, 0.0518f, 0.7559f, 0.3500f, 0.3893f, 0.2471f, 0.8592f,
+        0.2973f, 0.2102f, 0.3092f, 0.2031f, 0.3177f, 0.0829f, 0.1585f, 0.4171f, 0.8795f, 0.0573f,
+        0.2127f, 0.9083f, 0.8900f, 0.6795f, 0.2405f, 0.4198f, 0.2112f, 0.1286f, 0.3800f, 0.5758f,
+        0.3599f, 0.6108f, 0.2963f, 0.3459f, 0.7907f, 0.8783f, 0.3220f, 0.5715f, 0.2782f, 0.0533f,
+        0.7379f, 0.1710f, 0.4257f, 0.4870f, 0.1845f, 0.0946f, 0.3480f, 0.9523f, 0.6151f, 0.3814f,
+        0.0389f, 0.6003f, 0.0923f, 0.5425f, 0.7520f, 0.4236f, 0.2994f, 0.0474f, 0.0248f, 0.4300f,
+        0.8833f, 0.2441f, 0.5741f, 0.6843f, 0.0608f, 0.1531f, 0.3313f, 0.6701f, 0.4390f, 0.7342f,
+        0.8676f, 0.7584f, 0.9922f, 0.7544f, 0.8522f, 0.8324f, 0.7303f, 0.8018f, 0.9347f, 0.4752f,
+        0.6383f, 0.5149f, 0.8510f, 0.4314f, 0.8197f, 0.7994f, 0.9619f, 0.2489f, 0.7096f, 0.7569f,
+        0.9363f, 0.9069f, 0.5735f, 0.5792f, 0.1673f, 0.9750f, 0.2550f, 0.7247f, 0.7958f, 0.4412f,
+        0.2112f, 0.1890f, 0.8565f, 0.5108f, 0.0901f, 0.7170f, 0.2502f, 0.8764f, 0.3096f, 0.2003f,
+        0.0849f, 0.5115f, 0.4507f, 0.7513f, 0.4646f, 0.3438f, 0.2617f, 0.2781f, 0.9278f, 0.1651f,
+        0.9882f, 0.3269f, 0.0884f, 0.2487f, 0.0584f, 0.7900f, 0.5126f, 0.3370f, 0.6620f, 0.6306f,
+        0.9399f, 0.9613f, 0.6807f, 0.8178f, 0.7924f, 0.4913f, 0.7045f, 0.0783f, 0.7580f, 0.9618f,
+        0.0850f, 0.8361f, 0.9330f, 0.2262f, 0.5248f, 0.9279f, 0.9602f, 0.1279f, 0.3490f, 0.6981f,
+        0.2216f, 0.3248f, 0.0233f, 0.1535f, 0.5623f, 0.6531f, 0.6489f, 0.7784f, 0.4153f, 0.2735f,
+        0.0156f, 0.2066f, 0.3124f, 0.1782f, 0.0201f, 0.1574f, 0.6661f, 0.6296f, 0.9357f, 0.7982f,
+        0.5678f, 0.1376f, 0.5641f, 0.0616f, 0.4309f, 0.3903f, 0.4278f, 0.2798f, 0.6858f, 0.8409f,
+        0.7685f, 0.6278f, 0.5383f, 0.0311f, 0.7229f, 0.5450f, 0.2707f, 0.3278f, 0.9356f, 0.6244f,
+        0.4759f, 0.6209f, 0.4137f, 0.4702f, 0.2903f, 0.4399f, 0.6856f, 0.0399f, 0.7950f, 0.2830f,
+        0.6826f, 0.6427f, 0.6526f, 0.6081f, 0.9591f, 0.5083f, 0.7323f, 0.7054f, 0.2363f, 0.2833f,
+        0.4240f, 0.2777f, 0.3667f, 0.3910f, 0.6039f, 0.2199f, 0.8043f, 0.4375f, 0.7062f, 0.0814f,
+        0.4700f, 0.0282f, 0.6759f, 0.3437f, 0.9493f, 0.3241f, 0.5638f, 0.2574f, 0.6201f, 0.4670f,
+        0.3706f, 0.2037f, 0.1115f, 0.1199f, 0.9990f, 0.4123f, 0.0019f, 0.9529f, 0.0200f, 0.4186f,
+        0.7175f, 0.9146f, 0.7129f, 0.4636f, 0.9744f, 0.0393f, 0.9869f, 0.8494f, 0.9289f, 0.2548f,
+        0.1425f, 0.6633f, 0.5159f, 0.5232f, 0.9246f, 0.6201f, 0.3111f, 0.4001f, 0.1335f, 0.1923f,
+        0.1434f, 0.8103f, 0.7049f, 0.5303f, 0.3744f, 0.6685f, 0.8129f, 0.8812f, 0.5470f, 0.8199f,
+        0.5113f, 0.4745f, 0.8654f, 0.3864f, 0.3959f, 0.3049f, 0.5187f, 0.5449f, 0.6605f, 0.4305f,
+        0.2178f, 0.8668f, 0.3460f, 0.9229f, 0.2074f, 0.5601f, 0.5366f, 0.8286f, 0.1389f, 0.9099f,
+        0.5314f, 0.5861f, 0.5102f, 0.0360f, 0.4971f, 0.2635f, 0.3427f, 0.6491f, 0.4977f, 0.0932f,
+        0.0730f, 0.1857f, 0.1909f, 0.6083f, 0.1778f, 0.8817f, 0.2098f, 0.0911f, 0.8757f, 0.2953f,
+        0.4254f, 0.9590f, 0.9444f, 0.7149f, 0.0689f, 0.5933f, 0.9891f, 0.9469f, 0.1060f, 0.3960f};
 
     migraphx::parameter_map host_params;
     host_params["boxes"]  = migraphx::argument(boxes_s, boxes_vec.data());
@@ -876,7 +1010,20 @@ TEST_CASE(nms_200boxes_2batch_2class_test)
 
     auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
     indices.resize(static_cast<std::size_t>(num_selected) * 3);
-    std::vector<int64_t> gold = {0, 0, 143, 0, 0, 10, 0, 0, 13, 0, 0, 0, 0, 0, 90, 0, 0, 135, 0, 0, 1, 0, 0, 76, 0, 0, 108, 0, 0, 170, 0, 0, 140, 0, 0, 20, 0, 0, 151, 0, 0, 150, 0, 0, 39, 0, 0, 44, 0, 0, 41, 0, 0, 82, 0, 0, 80, 0, 0, 88, 0, 0, 16, 0, 0, 27, 0, 0, 167, 0, 0, 165, 0, 0, 181, 0, 1, 187, 0, 1, 94, 0, 1, 152, 0, 1, 72, 0, 1, 32, 0, 1, 153, 0, 1, 109, 0, 1, 150, 0, 1, 19, 0, 1, 27, 0, 1, 96, 0, 1, 35, 0, 1, 197, 0, 1, 68, 0, 1, 22, 0, 1, 154, 0, 1, 17, 0, 1, 117, 0, 1, 43, 0, 1, 97, 0, 1, 10, 0, 1, 180, 0, 1, 182, 0, 1, 67, 0, 1, 44, 1, 0, 35, 1, 0, 152, 1, 0, 175, 1, 0, 4, 1, 0, 71, 1, 0, 166, 1, 0, 127, 1, 0, 38, 1, 0, 170, 1, 0, 44, 1, 0, 158, 1, 0, 198, 1, 0, 24, 1, 0, 101, 1, 0, 171, 1, 0, 2, 1, 0, 53, 1, 0, 102, 1, 0, 66, 1, 0, 140, 1, 0, 37, 1, 0, 98, 1, 0, 115, 1, 0, 150, 1, 0, 6, 1, 1, 114, 1, 1, 196, 1, 1, 0, 1, 1, 126, 1, 1, 124, 1, 1, 19, 1, 1, 11, 1, 1, 26, 1, 1, 84, 1, 1, 191, 1, 1, 117, 1, 1, 104, 1, 1, 197, 1, 1, 192, 1, 1, 10, 1, 1, 48, 1, 1, 68, 1, 1, 22, 1, 1, 128, 1, 1, 25, 1, 1, 134, 1, 1, 163, 1, 1, 121, 1, 1, 169, 1, 1, 185};
+    std::vector<int64_t> gold = {
+        0, 0, 143, 0, 0, 10,  0, 0, 13,  0, 0, 0,   0, 0, 90,  0, 0, 135, 0, 0, 1,   0, 0, 76,
+        0, 0, 108, 0, 0, 170, 0, 0, 140, 0, 0, 20,  0, 0, 151, 0, 0, 150, 0, 0, 39,  0, 0, 44,
+        0, 0, 41,  0, 0, 82,  0, 0, 80,  0, 0, 88,  0, 0, 16,  0, 0, 27,  0, 0, 167, 0, 0, 165,
+        0, 0, 181, 0, 1, 187, 0, 1, 94,  0, 1, 152, 0, 1, 72,  0, 1, 32,  0, 1, 153, 0, 1, 109,
+        0, 1, 150, 0, 1, 19,  0, 1, 27,  0, 1, 96,  0, 1, 35,  0, 1, 197, 0, 1, 68,  0, 1, 22,
+        0, 1, 154, 0, 1, 17,  0, 1, 117, 0, 1, 43,  0, 1, 97,  0, 1, 10,  0, 1, 180, 0, 1, 182,
+        0, 1, 67,  0, 1, 44,  1, 0, 35,  1, 0, 152, 1, 0, 175, 1, 0, 4,   1, 0, 71,  1, 0, 166,
+        1, 0, 127, 1, 0, 38,  1, 0, 170, 1, 0, 44,  1, 0, 158, 1, 0, 198, 1, 0, 24,  1, 0, 101,
+        1, 0, 171, 1, 0, 2,   1, 0, 53,  1, 0, 102, 1, 0, 66,  1, 0, 140, 1, 0, 37,  1, 0, 98,
+        1, 0, 115, 1, 0, 150, 1, 0, 6,   1, 1, 114, 1, 1, 196, 1, 1, 0,   1, 1, 126, 1, 1, 124,
+        1, 1, 19,  1, 1, 11,  1, 1, 26,  1, 1, 84,  1, 1, 191, 1, 1, 117, 1, 1, 104, 1, 1, 197,
+        1, 1, 192, 1, 1, 10,  1, 1, 48,  1, 1, 68,  1, 1, 22,  1, 1, 128, 1, 1, 25,  1, 1, 134,
+        1, 1, 163, 1, 1, 121, 1, 1, 169, 1, 1, 185};
     EXPECT(migraphx::verify::verify_rms_range(indices, gold));
     EXPECT(num_selected == 100);
 }

From 49e3a2a2fc3db6163ddeb6d7e791f7fe5025778a Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 14:01:36 -0500
Subject: [PATCH 23/32] Update NMS op to do fixed_shape_error_check only on
 fixed shapes

---
 src/include/migraphx/op/nonmaxsuppression.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index b71bc4822eb..96ad442cd8a 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -93,7 +93,10 @@ struct nonmaxsuppression
             }
         };
 
-        fixed_shape_error_check();
+        if(not (inputs.at(0).dynamic() or inputs.at(1).dynamic()))
+        {
+            fixed_shape_error_check();
+        }
         std::vector<std::size_t> out_lens = {max_num_boxes, 3};
         shape s_ind{shape::int64_type, out_lens};
         shape s_num_selected{shape::int64_type, {1}};

From 94c374438dd0157cfe8c8132002d36327a7427b8 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 14:44:32 -0500
Subject: [PATCH 24/32] Remove use_dyn_output; update NMS tests and fixes

---
 docs/dev/onnx_operators.rst                   |  18 +--
 src/include/migraphx/onnx.hpp                 |   2 -
 .../include/migraphx/onnx/onnx_parser.hpp     |   1 -
 src/onnx/onnx.cpp                             |   1 -
 src/onnx/parse_nonmaxsuppression.cpp          |  11 +-
 src/targets/gpu/lowering.cpp                  |   3 +-
 test/multi_target/multitarget_test.cpp        |   9 +-
 test/onnx/gen_onnx.py                         |  28 +---
 test/onnx/nms_dynamic_batch_test.onnx         | Bin 411 -> 388 bytes
 test/onnx/nms_use_dyn_output_false_test.onnx  | Bin 404 -> 0 bytes
 test/onnx/parse/nms_dynamic_batch_test.cpp    |   7 +-
 test/onnx/parse/nms_dynamic_boxes_test.cpp    |   6 +-
 test/onnx/parse/nms_dynamic_classes_test.cpp  |   6 +-
 test/onnx/parse/nms_test.cpp                  |   3 +-
 .../parse/nms_use_dyn_output_false_test.cpp   |  55 -------
 test/op_shape_test.cpp                        | 140 ++++--------------
 test/ref/nonmaxsuppression.cpp                | 126 ++++++++--------
 17 files changed, 128 insertions(+), 288 deletions(-)
 delete mode 100644 test/onnx/nms_use_dyn_output_false_test.onnx
 delete mode 100644 test/onnx/parse/nms_use_dyn_output_false_test.cpp

diff --git a/docs/dev/onnx_operators.rst b/docs/dev/onnx_operators.rst
index a24fbac6a80..e3bd403db67 100644
--- a/docs/dev/onnx_operators.rst
+++ b/docs/dev/onnx_operators.rst
@@ -511,15 +511,15 @@ Operator Support Matrix
 +--------------------------+-----------+-----------------+------------------------------+
 | NegativeLogLikelihoodLoss| ❌        |                 |                              |
 +--------------------------+-----------+-----------------+------------------------------+
-| NonMaxSuppression        | ✅        | FP8, FP16,      | fixed output                 |
-|                          |           | FP32, FP64      | size unless                  |
-|                          |           |                 | ``use_dyn_output``           |
-|                          |           |                 | set                          |
-+--------------------------+-----------+-----------------+------------------------------+
-| NonZero                  | ✅        | FP8, FP16,      | fixed output                 |
-|                          |           | FP32, FP64      | size unless                  |
-|                          |           |                 | ``use_dyn_output``           |
-|                          |           |                 | set                          |
+| NonMaxSuppression        | ✅        | FP8, FP16,      |                              |
+|                          |           | FP32, FP64      |                              |
+|                          |           |                 |                              |
+|                          |           |                 |                              |
++--------------------------+-----------+-----------------+------------------------------+
+| NonZero                  | ✅        | FP8, FP16,      | fixed output size            |
+|                          |           | FP32, FP64      |                              |
+|                          |           |                 |                              |
+|                          |           |                 |                              |
 +--------------------------+-----------+-----------------+------------------------------+
 | Not                      | ✅        | BOOL            |                              |
 +--------------------------+-----------+-----------------+------------------------------+
diff --git a/src/include/migraphx/onnx.hpp b/src/include/migraphx/onnx.hpp
index 13745994fd7..6715022a1ec 100644
--- a/src/include/migraphx/onnx.hpp
+++ b/src/include/migraphx/onnx.hpp
@@ -56,8 +56,6 @@ struct onnx_options
     /// Since loop will become a tensor of max iter size a huge number can cause overflow during
     /// shape computations.
     int64_t limit_max_iterations = std::numeric_limits<uint16_t>::max();
-    /// Use dynamic output for operators when available
-    bool use_dyn_output = false;
     /// Parse in ONNX node names as debug symbols
     bool use_debug_symbols = false;
     /// Path to use for the external data if it is stored at different location compared to onnx
diff --git a/src/onnx/include/migraphx/onnx/onnx_parser.hpp b/src/onnx/include/migraphx/onnx/onnx_parser.hpp
index 4f58cd085a9..0d83a094759 100644
--- a/src/onnx/include/migraphx/onnx/onnx_parser.hpp
+++ b/src/onnx/include/migraphx/onnx/onnx_parser.hpp
@@ -102,7 +102,6 @@ struct onnx_parser
     std::unordered_map<std::string, std::vector<std::size_t>> map_input_dims;
     std::unordered_map<std::string, shape::dynamic_dimension> dim_params;
     std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims;
-    bool use_dyn_output          = false;
     bool skip_unknown_operators  = false;
     bool use_debug_symbols       = false;
     int64_t max_loop_iterations  = 10;
diff --git a/src/onnx/onnx.cpp b/src/onnx/onnx.cpp
index a14b4b3c581..cab3fc5daa1 100644
--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -72,7 +72,6 @@ static program parse_onnx_from(const onnx_options& options, Ts&&... xs)
     parser.skip_unknown_operators = options.skip_unknown_operators;
     parser.max_loop_iterations    = options.max_loop_iterations;
     parser.limit_max_iterations   = options.limit_max_iterations;
-    parser.use_dyn_output         = options.use_dyn_output;
 
     if(options.print_program_on_error)
     {
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index 74427a9d5b8..959683d01e9 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -25,7 +25,7 @@
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
 
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS)
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -46,16 +46,17 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
         auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
         if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{}))
         {
-            return indices;
-        }
-        else
-        {
+            //TODO: planning to make this the default behavior and removing the env var.
             auto num_selected =
                 info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
             auto slice_ins = info.add_instruction(
                 make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected);
             return slice_ins;
         }
+        else
+        {
+            return indices;
+        }
     }
 };
 
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index a7baf80a755..092fe42892f 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -59,6 +59,7 @@ namespace gpu {
 
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_SET_GEMM_PROVIDER)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MIOPEN_POOLING)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS)
 
 struct miopen_apply
 {
@@ -484,7 +485,7 @@ struct miopen_apply
             // replace_allocate pass can later turn it into hip::allocate.
             shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
             auto mask_alloc = insert_allocation(ins, mask_shape);
-
+            
             auto sorted = mod->insert_instruction(
                 ins,
                 make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}),
diff --git a/test/multi_target/multitarget_test.cpp b/test/multi_target/multitarget_test.cpp
index a52d4940ce6..1ca5758e74a 100644
--- a/test/multi_target/multitarget_test.cpp
+++ b/test/multi_target/multitarget_test.cpp
@@ -216,14 +216,17 @@ TEST_CASE(single_target_multi_compile)
     auto max_out_l                = gpu_mod->add_literal(int64_t{4});
     auto iou_threshold            = gpu_mod->add_literal(0.5f);
     auto score_threshold          = gpu_mod->add_literal(0.0f);
-    auto r                        = gpu_mod->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+    auto nms = gpu_mod->add_instruction(
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         boxes_param_gpu,
         scores_l,
         max_out_l,
         iou_threshold,
         score_threshold);
+    auto idx = gpu_mod->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
+    auto cnt = gpu_mod->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms);
+    auto r   = gpu_mod->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), idx, cnt);
     gpu_mod->add_return({r});
 
     auto run_on_gpu = mm->add_instruction(
diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py
index b359b64de76..0423f56963a 100644
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -11158,31 +11158,6 @@ def nms_test():
     return ([node], [b, s, mo, iou, st], [out])
 
 
-@onnx_test()
-def nms_use_dyn_output_false_test():
-    b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, 6, 4])
-    s = helper.make_tensor_value_info('scores', TensorProto.FLOAT, [1, 1, 6])
-    mo = helper.make_tensor_value_info('max_output_boxes_per_class',
-                                       TensorProto.INT64, [1])
-    iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT,
-                                        [1])
-    st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
-                                       [1])
-    out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
-                                        [None, 3])
-
-    node = onnx.helper.make_node('NonMaxSuppression',
-                                 inputs=[
-                                     'boxes', 'scores',
-                                     'max_output_boxes_per_class',
-                                     'iou_threshold', 'score_threshold'
-                                 ],
-                                 outputs=['selected_indices'],
-                                 use_dyn_output=0)
-
-    return ([node], [b, s, mo, iou, st], [out])
-
-
 @onnx_test()
 def nms_dynamic_batch_test():
     b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [None, 6, 4])
@@ -11204,8 +11179,7 @@ def nms_dynamic_batch_test():
                                      'iou_threshold', 'score_threshold'
                                  ],
                                  outputs=['selected_indices'],
-                                 center_point_box=1,
-                                 use_dyn_output=1)
+                                 center_point_box=1)
 
     return ([node], [b, s, mo, iou, st], [out])
 
diff --git a/test/onnx/nms_dynamic_batch_test.onnx b/test/onnx/nms_dynamic_batch_test.onnx
index 65b3ff5bd74f800105c2cdcb6e2e1d9303ca8d74..174d699680c39e364c184cac3a735d88399c819b 100644
GIT binary patch
delta 24
gcmbQu+``Pn!8!4O1gq6!Ca%tj@?8@T++q|409PXi(f|Me

delta 63
zcmZo+p3N-J!7e10ms=d4Qkj>So0%M+lvt9S5nqy8Tw?W$iEF_``7R+XQ7*pH;#825
R`25n6g3^+SOYSrB0sxsl7CHa`

diff --git a/test/onnx/nms_use_dyn_output_false_test.onnx b/test/onnx/nms_use_dyn_output_false_test.onnx
deleted file mode 100644
index 6f4d989d6a676004f890b9bd964345499cefe784..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 404
zcmah^ONzok5VfNwP2ndt2r{D}uEZm_bnk53Wz)o@#X#&1RyV{GcnuHeAuK~=P(c?(
zLA_7C#{)6Z2b2z#Oix-GXA5WLF3#y-724(1gENEn)|3jt$HW|I{~Y4-!L1NksH05s
z7!kUeankOIwl{eS{YvtG5Fx5uC03;}(`l9{oDKerUdQFe6$Pm<GkP(E-gBf;<HEcz
zS)<=Nrw|ZwCOY78M&r}8qVXgB3tBOv`KL|-fb&4K?tb*-xnTve8l1fMy=!ZDJB7FQ
QloH6>hX%PfS6<k?0q8Y!6#xJL

diff --git a/test/onnx/parse/nms_dynamic_batch_test.cpp b/test/onnx/parse/nms_dynamic_batch_test.cpp
index 4fc4ab22d2f..bb7e350bea8 100644
--- a/test/onnx/parse/nms_dynamic_batch_test.cpp
+++ b/test/onnx/parse/nms_dynamic_batch_test.cpp
@@ -38,19 +38,18 @@ TEST_CASE(nms_dynamic_batch_test)
     auto iou = mm->add_parameter("iou_threshold", siou);
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st  = mm->add_parameter("score_threshold", sst);
-    auto ret = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         b,
         s,
         mo,
         iou,
         st);
+    auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
     migraphx::onnx_options options;
     options.default_dyn_dim_value = {1, 10};
-    options.use_dyn_output        = true;
 
     auto prog = read_onnx("nms_dynamic_batch_test.onnx", options);
     EXPECT(p == prog);
diff --git a/test/onnx/parse/nms_dynamic_boxes_test.cpp b/test/onnx/parse/nms_dynamic_boxes_test.cpp
index 42706ccefdc..d11552ca3d7 100644
--- a/test/onnx/parse/nms_dynamic_boxes_test.cpp
+++ b/test/onnx/parse/nms_dynamic_boxes_test.cpp
@@ -38,13 +38,13 @@ TEST_CASE(nms_dynamic_boxes_test)
     auto iou = mm->add_parameter("iou_threshold", siou);
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st  = mm->add_parameter("score_threshold", sst);
-    auto ret = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), b, s, mo, iou, st);
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st);
+    auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
     migraphx::onnx_options options;
     options.default_dyn_dim_value = {6, 20};
-    options.use_dyn_output        = true;
 
     auto prog = read_onnx("nms_dynamic_boxes_test.onnx", options);
     EXPECT(p == prog);
diff --git a/test/onnx/parse/nms_dynamic_classes_test.cpp b/test/onnx/parse/nms_dynamic_classes_test.cpp
index 9e230a067f9..67a21634568 100644
--- a/test/onnx/parse/nms_dynamic_classes_test.cpp
+++ b/test/onnx/parse/nms_dynamic_classes_test.cpp
@@ -38,13 +38,13 @@ TEST_CASE(nms_dynamic_classes_test)
     auto iou = mm->add_parameter("iou_threshold", siou);
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st  = mm->add_parameter("score_threshold", sst);
-    auto ret = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), b, s, mo, iou, st);
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st);
+    auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
     migraphx::onnx_options options;
     options.default_dyn_dim_value = {1, 10};
-    options.use_dyn_output        = true;
 
     auto prog = read_onnx("nms_dynamic_classes_test.onnx", options);
     EXPECT(p == prog);
diff --git a/test/onnx/parse/nms_test.cpp b/test/onnx/parse/nms_test.cpp
index 6836117bbd8..f8826a8a96e 100644
--- a/test/onnx/parse/nms_test.cpp
+++ b/test/onnx/parse/nms_test.cpp
@@ -43,8 +43,9 @@ TEST_CASE(nms_test)
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st = mm->add_parameter("score_threshold", sst);
 
-    auto ret = mm->add_instruction(
+    auto nms = mm->add_instruction(
         migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), b, s, mo, iou, st);
+    auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
     auto prog = read_onnx("nms_test.onnx");
diff --git a/test/onnx/parse/nms_use_dyn_output_false_test.cpp b/test/onnx/parse/nms_use_dyn_output_false_test.cpp
deleted file mode 100644
index 8e95686550f..00000000000
--- a/test/onnx/parse/nms_use_dyn_output_false_test.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <onnx_test.hpp>
-
-TEST_CASE(nms_overwrite_use_dyn_output_test)
-{
-    migraphx::program p;
-    auto* mm = p.get_main_module();
-    migraphx::shape sb{migraphx::shape::float_type, {1, 6, 4}};
-    auto b = mm->add_parameter("boxes", sb);
-
-    migraphx::shape ss{migraphx::shape::float_type, {1, 1, 6}};
-    auto s = mm->add_parameter("scores", ss);
-
-    migraphx::shape smo{migraphx::shape::int64_type, {1}};
-    auto mo = mm->add_parameter("max_output_boxes_per_class", smo);
-
-    migraphx::shape siou{migraphx::shape::float_type, {1}};
-    auto iou = mm->add_parameter("iou_threshold", siou);
-
-    migraphx::shape sst{migraphx::shape::float_type, {1}};
-    auto st = mm->add_parameter("score_threshold", sst);
-
-    auto ret = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), b, s, mo, iou, st);
-    mm->add_return({ret});
-
-    migraphx::onnx_options options;
-    options.use_dyn_output = true;
-
-    auto prog = read_onnx("nms_use_dyn_output_false_test.onnx", options);
-    EXPECT(p == prog);
-}
diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp
index d6655e49967..89f1a743f06 100644
--- a/test/op_shape_test.cpp
+++ b/test/op_shape_test.cpp
@@ -2853,64 +2853,46 @@ TEST_CASE(multinomial_dyn)
 
 TEST_CASE(nms_shape)
 {
-    // use_dyn_output == false
-    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
-    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    // The nonmaxsuppression op always returns a tuple shape:
+    //   {indices [max_num_boxes, 3] int64, num_selected [1] int64}
+    // where max_num_boxes = max_batches * max_classes * max_spatial_dim (from max_lens).
     migraphx::shape max_out_s{migraphx::shape::int64_type, {1}};
     migraphx::shape iou_thres_s{migraphx::shape::float_type, {1}};
     migraphx::shape score_thres_s{migraphx::shape::float_type, {1}};
-    migraphx::shape output_s{migraphx::shape::int64_type, {6, 3}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", false}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
+    migraphx::shape num_selected_s{migraphx::shape::int64_type, {1}};
+
+    auto nms_tuple = [&](std::size_t max_num_boxes) {
+        return migraphx::shape(
+            {migraphx::shape{migraphx::shape::int64_type, {max_num_boxes, 3}}, num_selected_s});
+    };
 
-    // use_dyn_output == true
-    output_s = {migraphx::shape::int64_type, {{0, 6}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
+    // fully static inputs
+    migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}};
+    migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}};
+    expect_shape(nms_tuple(6),
+                 migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                  boxes_s,
                  scores_s,
                  max_out_s,
                  iou_thres_s,
                  score_thres_s);
 
-    // dynamic batches
+    // dynamic batches: max_num_boxes = 3 * 1 * 6 = 18
     boxes_s  = {migraphx::shape::float_type, {{1, 3}, {6, 6}, {4, 4}}};
     scores_s = {migraphx::shape::float_type, {{1, 3}, {1, 1}, {6, 6}}};
-    output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
+    expect_shape(nms_tuple(18),
+                 migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                  boxes_s,
                  scores_s,
                  max_out_s,
                  iou_thres_s,
                  score_thres_s);
 
-    // dynamic num boxes
+    // dynamic num boxes: max_num_boxes = 1 * 1 * 20 = 20
     boxes_s  = {migraphx::shape::float_type, {{1, 1}, {6, 20}, {4, 4}}};
     scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 1}, {6, 20}}};
-    output_s = {migraphx::shape::int64_type, {{0, 20}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
-
-    // use_dyn_output false with dynamic input shape: auto-enables dynamic output
-    output_s = {migraphx::shape::int64_type, {{0, 20}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", false}}),
+    expect_shape(nms_tuple(20),
+                 migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                  boxes_s,
                  scores_s,
                  max_out_s,
@@ -2920,90 +2902,20 @@ TEST_CASE(nms_shape)
     // dynamic classes: max_num_boxes = 1 * 3 * 6 = 18
     boxes_s  = {migraphx::shape::float_type, {{1, 1}, {6, 6}, {4, 4}}};
     scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 3}, {6, 6}}};
-    output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
-
-    // fixed mismatch batches: use_dyn_output=true takes dynamic path, deferred to runtime
-    // max_num_boxes = 2 * 1 * 6 = 12
-    boxes_s  = {migraphx::shape::float_type, {2, 6, 4}};
-    scores_s = {migraphx::shape::float_type, {1, 1, 6}};
-    output_s = {migraphx::shape::int64_type, {{0, 12}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
+    expect_shape(nms_tuple(18),
+                 migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                  boxes_s,
                  scores_s,
                  max_out_s,
                  iou_thres_s,
                  score_thres_s);
 
-    // fixed mismatch num boxes: use_dyn_output=true takes dynamic path, deferred to runtime
-    // max_num_boxes = 1 * 1 * 6 = 6
-    boxes_s  = {migraphx::shape::float_type, {1, 6, 4}};
-    scores_s = {migraphx::shape::float_type, {1, 1, 4}};
-    output_s = {migraphx::shape::int64_type, {{0, 6}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
-
-    // dynamic mismatch batches: deferred to runtime validation
-    // max_num_boxes = boxes_max[0] * scores_max[1] * boxes_max[1] = 4 * 1 * 6 = 24
-    boxes_s  = {migraphx::shape::float_type, {{1, 4}, {6, 6}, {4, 4}}};
-    scores_s = {migraphx::shape::float_type, {{2, 8}, {1, 1}, {6, 6}}};
-    output_s = {migraphx::shape::int64_type, {{0, 24}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
-
-    // dynamic mismatch num boxes: deferred to runtime validation
+    // dynamic mismatch num boxes: deferred to runtime validation.
+    // spatial dim is taken from boxes.max_lens()[1] = 8, so max_num_boxes = 1 * 1 * 8 = 8
     boxes_s  = {migraphx::shape::float_type, {{1, 1}, {6, 8}, {4, 4}}};
     scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 1}, {3, 9}}};
-    output_s = {migraphx::shape::int64_type, {{0, 8}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
-
-    // dynamic number of classes, fixed boxes_s, mismatch batches: deferred to runtime
-    boxes_s  = {migraphx::shape::float_type, {1, 6, 4}};
-    scores_s = {migraphx::shape::float_type, {{1, 3}, {1, 3}, {6, 6}}};
-    output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
-                 boxes_s,
-                 scores_s,
-                 max_out_s,
-                 iou_thres_s,
-                 score_thres_s);
-    // dynamic number of classes, fixed boxes_s, mismatch num boxes: deferred to runtime
-    boxes_s  = {migraphx::shape::float_type, {1, 6, 4}};
-    scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 3}, {4, 8}}};
-    output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}};
-    expect_shape(output_s,
-                 migraphx::make_op("nonmaxsuppression",
-                                   {{"center_point_box", true}, {"use_dyn_output", true}}),
+    expect_shape(nms_tuple(8),
+                 migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
                  boxes_s,
                  scores_s,
                  max_out_s,
diff --git a/test/ref/nonmaxsuppression.cpp b/test/ref/nonmaxsuppression.cpp
index c65dc4916ea..8f16ec6dd75 100644
--- a/test/ref/nonmaxsuppression.cpp
+++ b/test/ref/nonmaxsuppression.cpp
@@ -30,6 +30,15 @@
 
 #include <test.hpp>
 
+static migraphx::instruction_ref add_nms_dynamic_slice(migraphx::module* mm,
+                                                       migraphx::instruction_ref nms)
+{
+    auto idx = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
+    auto cnt = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms);
+    return mm->add_instruction(
+        migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), idx, cnt);
+}
+
 TEST_CASE(nms_dyn_out_test)
 {
     migraphx::program p;
@@ -47,14 +56,13 @@ TEST_CASE(nms_dyn_out_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
-        boxes_l,
-        scores_l,
-        max_out_l,
-        iou_threshold,
-        score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                                   boxes_l,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -83,12 +91,13 @@ TEST_CASE(nms_identical_all_dyn_out_test)
     auto iou_threshold   = mm->add_literal(0.1f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}),
-                                 boxes_l,
-                                 scores_l,
-                                 max_out_l,
-                                 iou_threshold,
-                                 score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_l,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -114,14 +123,14 @@ TEST_CASE(nms_dyn_batch_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         boxes_p,
         scores_p,
         max_out_l,
         iou_threshold,
         score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -160,14 +169,14 @@ TEST_CASE(nms_dyn_boxes_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         boxes_p,
         scores_p,
         max_out_l,
         iou_threshold,
         score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -203,14 +212,14 @@ TEST_CASE(nms_dyn_classes_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         boxes_p,
         scores_p,
         max_out_l,
         iou_threshold,
         score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -251,21 +260,20 @@ TEST_CASE(nms_not_center_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    // set use_dyn_output back to false in operator map
-    auto r =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", false}}),
-                            boxes_l,
-                            scores_l,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_l,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
     auto output = p.eval({}).back();
     std::vector<int64_t> result;
     output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
-    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(result, gold));
 }
 
@@ -286,20 +294,20 @@ TEST_CASE(nms_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                            boxes_l,
-                            scores_l,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                                   boxes_l,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
     auto output = p.eval({}).back();
     std::vector<int64_t> result;
     output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
-    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(result, gold));
 }
 
@@ -324,20 +332,20 @@ TEST_CASE(nms_transpose1_test)
 
     auto transpose_boxes = mm->add_instruction(
         migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l);
-    auto r =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                            transpose_boxes,
-                            scores_l,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                                   transpose_boxes,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
     auto output = p.eval({}).back();
     std::vector<int64_t> result;
     output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
-    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(result, gold));
 }
 
@@ -362,20 +370,20 @@ TEST_CASE(nms_transpose2_test)
 
     auto transpose_boxes = mm->add_instruction(
         migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l);
-    auto r =
-        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                            transpose_boxes,
-                            scores_l,
-                            max_out_l,
-                            iou_threshold,
-                            score_threshold);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                                   transpose_boxes,
+                                   scores_l,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
     auto output = p.eval({}).back();
     std::vector<int64_t> result;
     output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
-    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
     EXPECT(migraphx::verify::verify_rms_range(result, gold));
 }
 
@@ -396,14 +404,14 @@ TEST_CASE(nms_dyn_different_spatial_ranges_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto r = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression",
-                          {{"center_point_box", true}, {"use_dyn_output", true}}),
+    auto nms = mm->add_instruction(
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         boxes_p,
         scores_p,
         max_out_l,
         iou_threshold,
         score_threshold);
+    auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));

From 22d8beb390196d57916452cad9c1d5194d5cff9a Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 15:46:37 -0500
Subject: [PATCH 25/32] Add ref fallback for dynamic input NMS and clean up
 kernel types

---
 src/include/migraphx/op/nonmaxsuppression.hpp |   3 +-
 .../migraphx/kernels/nonmaxsuppression.hpp    |  44 ++---
 src/targets/gpu/lowering.cpp                  | 186 ++++++++++++------
 src/targets/gpu/nms_ops.cpp                   |   3 +
 test/gpu/nonmaxsuppression.cpp                |  49 +++++
 5 files changed, 204 insertions(+), 81 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 96ad442cd8a..87bd541fba1 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -189,7 +189,8 @@ struct nonmaxsuppression
         return intersection_over_union > iou_threshold;
     }
 
-    // filter boxes below score_threshold
+    // Filter boxes below score_threshold.
+    // Don't filter for score if score_threshold == 0.f
     template <class T>
     std::vector<std::pair<double, int64_t>>
     filter_boxes_by_score(T scores_start, std::size_t num_boxes, double score_threshold) const
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 7ae9638e173..177f373712a 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -64,23 +64,23 @@ __device__ inline array<typename Box::type, 4> nms_normalize_box(const Box box)
 {
     if constexpr(CenterPointBox)
     {
-        const float xc = box[0];
-        const float yc = box[1];
-        const float hw = box[2] * 0.5f;
-        const float hh = box[3] * 0.5f;
+        const auto xc = box[0];
+        const auto yc = box[1];
+        const auto hw = box[2] * 0.5f;
+        const auto hh = box[3] * 0.5f;
         return {xc - hw, yc - hh, xc + hw, yc + hh};
     }
     else
     {
         // ONNX layout: [y1, x1, y2, x2]; corners may be in either order.
-        const float y1   = box[0];
-        const float x1   = box[1];
-        const float y2   = box[2];
-        const float x2   = box[3];
-        const float xmin = min(x1, x2);
-        const float xmax = max(x1, x2);
-        const float ymin = min(y1, y2);
-        const float ymax = max(y1, y2);
+        const auto y1   = box[0];
+        const auto x1   = box[1];
+        const auto y2   = box[2];
+        const auto x2   = box[3];
+        const auto xmin = min(x1, x2);
+        const auto xmax = max(x1, x2);
+        const auto ymin = min(y1, y2);
+        const auto ymax = max(y1, y2);
         return {xmin, ymin, xmax, ymax};
     }
 }
@@ -88,16 +88,16 @@ __device__ inline array<typename Box::type, 4> nms_normalize_box(const Box box)
 template <class Box, class Threshold>
 __device__ inline bool nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold)
 {
-    const float left   = max(a[0], b[0]);
-    const float right  = min(a[2], b[2]);
-    const float top    = max(a[1], b[1]);
-    const float bottom = min(a[3], b[3]);
-    const float w      = max(right - left, 0.f);
-    const float h      = max(bottom - top, 0.f);
-    const float inter  = w * h;
-    const float area_a = max(a[2] - a[0], 0.f) * max(a[3] - a[1], 0.f);
-    const float area_b = max(b[2] - b[0], 0.f) * max(b[3] - b[1], 0.f);
-    const float un     = area_a + area_b - inter;
+    const auto left   = max(a[0], b[0]);
+    const auto right  = min(a[2], b[2]);
+    const auto top    = max(a[1], b[1]);
+    const auto bottom = min(a[3], b[3]);
+    const auto w      = max(right - left, 0.f);
+    const auto h      = max(bottom - top, 0.f);
+    const auto inter  = w * h;
+    const auto area_a = max(a[2] - a[0], 0.f) * max(a[3] - a[1], 0.f);
+    const auto area_b = max(b[2] - b[0], 0.f) * max(b[3] - b[1], 0.f);
+    const auto un     = area_a + area_b - inter;
     if(area_a <= 0.f or area_b <= 0.f or un <= 0.f)
         return false;
     return (inter / un) > threshold;
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 092fe42892f..192e3f8c45d 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -59,7 +59,6 @@ namespace gpu {
 
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_SET_GEMM_PROVIDER)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MIOPEN_POOLING)
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS)
 
 struct miopen_apply
 {
@@ -455,69 +454,140 @@ struct miopen_apply
     // compile pass can pick them up later. We can't rely on the main lowering
     // loop to wrap them: it walks forward, and the new instructions land
     // before `ins` so they would never be revisited.
+    //
+    // The kernels are JIT'd against compile-time sizes baked from the input
+    // shapes, so when either of `boxes` / `scores` is dynamic we fall back to
+    // executing the ref op on the host.
     void add_nms_op()
     {
         apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
-            auto inputs = ins->inputs();
-            const auto& boxes_s    = inputs[0]->get_shape();
-            const auto& scores_s   = inputs[1]->get_shape();
-            const auto num_batches = boxes_s.lens()[0];
-            const auto num_boxes   = boxes_s.lens()[1];
-            const auto num_classes = scores_s.lens()[1];
-            const auto iou_packed  = num_boxes * (num_boxes - 1) / 2;
-
-            // Fill in missing optional scalar inputs with default literals.
-            const shape default_max_s{shape::int64_type, {1}};
-            const shape default_iou_s{shape::float_type, {1}};
-            const shape default_thr_s{shape::float_type, {1}};
-            if(inputs.size() < 3)
-                inputs.push_back(
-                    mod->insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
-            if(inputs.size() < 4)
-                inputs.push_back(mod->insert_literal(ins, literal{default_iou_s, {0.0f}}));
-            if(inputs.size() < 5)
-                inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}}));
-
-            bool center_point_box =
-                ins->get_operator().to_value().at("center_point_box").to<bool>();
-
-            // Mask is scratch only; allocate up-front so the standard
-            // replace_allocate pass can later turn it into hip::allocate.
-            shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
-            auto mask_alloc = insert_allocation(ins, mask_shape);
-            
-            auto sorted = mod->insert_instruction(
-                ins,
-                make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}),
-                inputs[0],
-                inputs[1]);
-            sorted = insert_precompile_op(sorted);
-
-            auto filter = mod->insert_instruction(ins,
-                                                  make_op("gpu::nms_filter",
-                                                          {{"num_batches", num_batches},
-                                                           {"num_classes", num_classes},
-                                                           {"num_boxes", num_boxes}}),
-                                                  sorted,
-                                                  inputs[2],
-                                                  inputs[3],
-                                                  inputs[4],
-                                                  mask_alloc);
-            filter      = insert_precompile_op(filter);
-
-            auto raw_output =
-                mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
-            auto bc_counts =
-                mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
-
-            auto compact =
-                mod->insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
-            compact = insert_precompile_op(compact);
-
-            return mod->replace_instruction(ins, compact);
+            const auto& boxes_s  = ins->inputs()[0]->get_shape();
+            const auto& scores_s = ins->inputs()[1]->get_shape();
+            if(boxes_s.dynamic() or scores_s.dynamic())
+                return lower_nms_to_ref(ins);
+            return lower_nms_to_gpu_pipeline(ins);
         });
     }
 
+    // Static GPU pipeline: gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact.
+    instruction_ref lower_nms_to_gpu_pipeline(instruction_ref ins) const
+    {
+        auto inputs            = ins->inputs();
+        const auto& boxes_s    = inputs[0]->get_shape();
+        const auto& scores_s   = inputs[1]->get_shape();
+        const auto num_batches = boxes_s.lens()[0];
+        const auto num_boxes   = boxes_s.lens()[1];
+        const auto num_classes = scores_s.lens()[1];
+        const auto iou_packed  = num_boxes * (num_boxes - 1) / 2;
+
+        // Fill in missing optional scalar inputs with default literals.
+        const shape default_max_s{shape::int64_type, {1}};
+        const shape default_iou_s{shape::float_type, {1}};
+        const shape default_thr_s{shape::float_type, {1}};
+        if(inputs.size() < 3)
+            inputs.push_back(mod->insert_literal(ins, literal{default_max_s, {std::int64_t{0}}}));
+        if(inputs.size() < 4)
+            inputs.push_back(mod->insert_literal(ins, literal{default_iou_s, {0.0f}}));
+        if(inputs.size() < 5)
+            inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}}));
+
+        bool center_point_box =
+            ins->get_operator().to_value().at("center_point_box").to<bool>();
+
+        // Mask is scratch only; allocate up-front so the standard
+        // replace_allocate pass can later turn it into hip::allocate.
+        shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
+        auto mask_alloc = insert_allocation(ins, mask_shape);
+
+        auto sorted = mod->insert_instruction(
+            ins,
+            make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}),
+            inputs[0],
+            inputs[1]);
+        sorted = insert_precompile_op(sorted);
+
+        auto filter = mod->insert_instruction(ins,
+                                              make_op("gpu::nms_filter",
+                                                      {{"num_batches", num_batches},
+                                                       {"num_classes", num_classes},
+                                                       {"num_boxes", num_boxes}}),
+                                              sorted,
+                                              inputs[2],
+                                              inputs[3],
+                                              inputs[4],
+                                              mask_alloc);
+        filter      = insert_precompile_op(filter);
+
+        auto raw_output =
+            mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter);
+        auto bc_counts =
+            mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter);
+
+        auto compact =
+            mod->insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output);
+        compact = insert_precompile_op(compact);
+
+        return mod->replace_instruction(ins, compact);
+    }
+
+    // Dynamic-shape fallback: run the ref `nonmaxsuppression` op on the host
+    // and copy each tuple element back to its own GPU allocation. Downstream
+    // `get_tuple_elem` consumers of `ins` are rewritten in place to point at
+    // the per-element GPU copies; `ins` itself is left for DCE to remove.
+    //
+    // The ref op produces a tuple {indices, num_selected}, and `hip::copy_to_gpu`
+    // is not tuple-aware (calls `argument::data()` which asserts non-tuple), so
+    // we have to split the tuple on the host side before copying back.
+    instruction_ref lower_nms_to_ref(instruction_ref ins) const
+    {
+        // Copy each input from GPU to host, then sync before running the ref op.
+        auto inputs = ins->inputs();
+        std::vector<instruction_ref> cpu_inputs;
+        cpu_inputs.reserve(inputs.size());
+        std::transform(
+            inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
+                return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
+            });
+        cpu_inputs.front() =
+            mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
+
+        // Ref op produces a tuple {indices [max_num_boxes, 3], num_selected [1]}.
+        auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
+
+        // For each sub-shape, extract on the host side and copy back to its
+        // own GPU allocation.
+        const auto& sub_shapes = ins->get_shape().sub_shapes();
+        std::vector<instruction_ref> gpu_subs;
+        gpu_subs.reserve(sub_shapes.size());
+        for(std::size_t i = 0; i < sub_shapes.size(); ++i)
+        {
+            auto cpu_sub = mod->insert_instruction(
+                ins, make_op("get_tuple_elem", {{"index", i}}), cpu_out);
+            auto gpu_alloc = insert_allocation(ins, sub_shapes[i]);
+            gpu_subs.push_back(mod->insert_instruction(
+                ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc));
+        }
+
+        // Snapshot outputs since we mutate the graph below.
+        auto consumers = ins->outputs();
+        for(auto consumer : consumers)
+        {
+            if(consumer->name() != "get_tuple_elem")
+                MIGRAPHX_THROW("gpu::add_nms_op: dynamic NMS fallback expects only "
+                               "get_tuple_elem consumers of nonmaxsuppression; got: " +
+                               consumer->name());
+            auto idx =
+                consumer->get_operator().to_value().at("index").to<std::size_t>();
+            assert(idx < gpu_subs.size());
+            mod->replace_instruction(consumer, gpu_subs[idx]);
+        }
+
+        // `ins` is now dead; leave it for dead_code_elimination. Return it so
+        // the apply-loop shape check (which compares against the original
+        // tuple shape) succeeds.
+        return ins;
+    }
+
     void add_lrn_op()
     {
         apply_map.emplace("lrn", [=](instruction_ref ins) {
diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp
index a1fb8fdfe48..f9ac82c8ebf 100644
--- a/src/targets/gpu/nms_ops.cpp
+++ b/src/targets/gpu/nms_ops.cpp
@@ -33,6 +33,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
 // Sort boxes per (batch, class) into nms_data{} tensor.
+// inputs = {boxes, scores}
 struct nms_sort
 {
     bool center_point_box = false;
@@ -69,6 +70,7 @@ MIGRAPHX_REGISTER_OP(nms_sort);
 // Produces a tuple of (raw_output, bc_counts).
 // num_batches/num_classes/num_boxes are kept as op attributes because the filter inputs
 // is a scratch buffer from which these can't be recovered.
+// inputs = {sorted_boxes, sorted_scores, sorted_box_indices, output_indices, output_bc_counts}
 struct nms_filter
 {
     std::size_t num_batches = 0;
@@ -99,6 +101,7 @@ MIGRAPHX_REGISTER_OP(nms_filter);
 //  Needs a make_tuple type of operator that reuses the indicies input.
 // Prefix-scan the per-block counts and compact the selections into
 // the final selected_indices. Output as selected_indices and num_selected tuple.
+// inputs = {output_bc_counts, output_indices}
 struct nms_compact
 {
     std::string name() const { return "gpu::nms_compact"; }
diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp
index f123263b596..4d9bdb89602 100644
--- a/test/gpu/nonmaxsuppression.cpp
+++ b/test/gpu/nonmaxsuppression.cpp
@@ -187,6 +187,55 @@ TEST_CASE(nms_not_center_test)
     EXPECT(num_selected == 3);
 }
 
+// Exercises the CPU fallback in src/targets/gpu/lowering.cpp::lower_nms_to_ref
+// by declaring dynamic-shape parameters for boxes/scores. Uses different
+// dynamic-dim ranges between boxes and scores so split_single_dyn_dim bails
+// out (its has_one_unique_dyn_dim check requires identical ranges) and the
+// dynamic shape survives until lowering. Same data and gold as
+// nms_not_center_test.
+TEST_CASE(nms_dynamic_fallback_test)
+{
+    using dd = migraphx::shape::dynamic_dimension;
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape boxes_dyn_s{migraphx::shape::float_type,
+                                {dd{1, 1}, dd{4, 10}, dd{4, 4}}};
+    migraphx::shape scores_dyn_s{migraphx::shape::float_type,
+                                 {dd{1, 1}, dd{1, 1}, dd{4, 8}}};
+
+    auto boxes_p         = mm->add_parameter("boxes", boxes_dyn_s);
+    auto scores_p        = mm->add_parameter("scores", scores_dyn_s);
+    auto max_out_l       = mm->add_literal(int64_t{4});
+    auto iou_threshold   = mm->add_literal(0.5f);
+    auto score_threshold = mm->add_literal(0.0f);
+
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"),
+                                   boxes_p,
+                                   scores_p,
+                                   max_out_l,
+                                   iou_threshold,
+                                   score_threshold);
+    add_nms_return(mm, nms);
+
+    std::vector<float> boxes_vec  = {1.0, 1.0,  0.0, 0.0,  0.0, 0.1,   1.0, 1.1,
+                                     0.0, 0.9,  1.0, -0.1, 0.0, 10.0,  1.0, 11.0,
+                                     1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0};
+    std::vector<float> scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f};
+
+    migraphx::shape boxes_runtime_s{migraphx::shape::float_type, {1, 6, 4}};
+    migraphx::shape scores_runtime_s{migraphx::shape::float_type, {1, 1, 6}};
+
+    migraphx::parameter_map host_params;
+    host_params["boxes"]  = migraphx::argument(boxes_runtime_s, boxes_vec.data());
+    host_params["scores"] = migraphx::argument(scores_runtime_s, scores_vec.data());
+
+    auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params);
+    indices.resize(static_cast<std::size_t>(num_selected) * 3);
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5};
+    EXPECT(migraphx::verify::verify_rms_range(indices, gold));
+    EXPECT(num_selected == 3);
+}
+
 TEST_CASE(nms_transpose1_test)
 {
     migraphx::program p;

From 8fc4844cd0c823e08aa35c3144f3eb07667431f3 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 16:23:08 -0500
Subject: [PATCH 26/32] Get rid of nms_data in kernel to use global memory only
 for now

---
 .../migraphx/kernels/nonmaxsuppression.hpp    | 164 ++++++++----------
 .../kernels/include/migraphx/kernels/sort.hpp |  48 ++++-
 src/targets/gpu/lowering.cpp                  |  41 ++---
 src/targets/gpu/nms_ops.cpp                   |   3 +-
 4 files changed, 132 insertions(+), 124 deletions(-)

diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 177f373712a..ee5b1b090e9 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -38,25 +38,6 @@
 
 namespace migraphx {
 
-template <class Score, class Box, class Index>
-struct nms_data
-{
-    // holds a copy of data
-    Score score;
-    array<Box, 4> box;
-    Index box_index;
-};
-
-// Comparator for sorting nms_data{} (or anything else with a `.score` field).
-struct nms_score_greater
-{
-    template <class T>
-    constexpr bool operator()(const T& a, const T& b) const
-    {
-        return a.score > b.score;
-    }
-};
-
 // Decode a single box into (xmin, ymin, xmax, ymax) corners.
 // Normalize such that [x1, y1] is the bottom left corner.
 template <bool CenterPointBox, class Box>
@@ -109,15 +90,14 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N
     return (i * N - (i * (i + 1)) / 2) + j - (i + 1);
 }
 
-// One block per (batch_idx, class_idx).
-// Load data into per-block buffer of nms_data.
-// Pads values after N with sentinel values.
-// Sorts the nms_data in descending order by score.
-// boxes_tv: dims([NumBatches, NumBoxes, 4])
-// scores_tv: dims([NumBatches, NumClasses, NumBoxes])
-// sorted_scores: output, dims([B, C, AlignedNumBoxes])
-// sorted_boxes: output, dims([B, C, AlignedNumBoxes, 4])
-// sorted_indices: output, dims([B, C, AlignedNumBoxes])
+// One block per (batch_idx, class_idx). Initializes the per-block slice of
+// sorted_* in place (padding past NumBoxes with score-sentinels) and bitonic
+// sorts the three global arrays in lockstep by descending score.
+// boxes_tv:       dims([NumBatches, NumBoxes, 4])
+// scores_tv:      dims([NumBatches, NumClasses, NumBoxes])
+// sorted_scores:  out, dims([B, C, AlignedNumBoxes])
+// sorted_boxes:   out, dims([B, C, AlignedNumBoxes, 4])
+// sorted_indices: out, dims([B, C, AlignedNumBoxes])
 template <bool CenterPointBox,
           index_int NumBatches,
           index_int NumClasses,
@@ -148,65 +128,83 @@ __device__ void nonmaxsuppression_sort(const Boxes boxes_tv,
     const auto my_scores =
         slice_tensor(scores_tv, array<index_int, 3>{batch_idx, class_idx, 0}, slice_axes<2>());
 
+    // TODO: make version that uses block shared memory if the data will fit
+    auto my_sorted_scores =
+        slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+    auto my_sorted_boxes =
+        slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
+    auto my_sorted_indices =
+        slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
+
     using scores_type  = typename SortedScores::type;
     using boxes_type   = typename SortedBoxes::type;
     using indices_type = typename SortedIndices::type;
-    // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS.
-    // TODO: can add a static_assert on needed LDS size
-    __shared__
-        uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, AlignedNumBoxes>
-            block_nms_data;
+
+    // Initialize sorted_* in place; pad past NumBoxes with sentinels that
+    // sink to the end under descending sort. sorted_boxes is 3D ([1, N, 4])
+    // since slice_tensor preserves rank.
     idx.local_stride(AlignedNumBoxes, [&](auto i) {
         if(i < NumBoxes)
         {
-            block_nms_data[i].score = my_scores[i];
-            block_nms_data[i].box   = nms_normalize_box<CenterPointBox>(
+            const auto box = nms_normalize_box<CenterPointBox>(
                 slice_tensor(boxes_tv, array<index_int, 3>{batch_idx, i, 0}, slice_axes<2>()));
-            block_nms_data[i].box_index = static_cast<int32_t>(i);
+            my_sorted_scores[i] = my_scores[i];
+            for(index_int k = 0; k < 4; ++k)
+                my_sorted_boxes[array<index_int, 3>{0, i, k}] = box[k];
+            my_sorted_indices[i] = static_cast<indices_type>(i);
         }
         else
         {
-            block_nms_data[i].score     = numeric_lowest<scores_type>();
-            block_nms_data[i].box       = array<boxes_type, 4>{0.f, 0.f, 0.f, 0.f};
-            block_nms_data[i].box_index = -1;
+            my_sorted_scores[i] = numeric_lowest<scores_type>();
+            for(index_int k = 0; k < 4; ++k)
+                my_sorted_boxes[array<index_int, 3>{0, i, k}] = boxes_type{0};
+            my_sorted_indices[i] = static_cast<indices_type>(-1);
         }
     });
     __syncthreads();
 
-    bitonic_sort{nms_score_greater{}}.template block_sort<AlignedNumBoxes>(idx, block_nms_data);
-
-    // Copy sorted result back to global memory.
-    auto block_out_scores =
-        slice_tensor(sorted_scores, array<index_int, 2>{block_id, 0}, slice_axes<1>());
-    auto block_out_boxes =
-        slice_tensor(sorted_boxes, array<index_int, 3>{block_id, 0, 0}, slice_axes<1, 2>());
-    auto block_out_indices =
-        slice_tensor(sorted_indices, array<index_int, 2>{block_id, 0}, slice_axes<1>());
-    idx.local_stride(AlignedNumBoxes, [&](auto i) {
-        block_out_scores[i] = block_nms_data[i].score;
-        auto out_box_iter   = block_out_boxes.begin_at(array<index_int, 3>{0, i, 0});
-        copy(block_nms_data[i].box.begin(), block_nms_data[i].box.end(), out_box_iter);
-        block_out_indices[i] = block_nms_data[i].box_index;
-    });
+    // Sort scores descending, dragging boxes and indices along. Uses the
+    // indexed variant so we can swap all 4 box lanes per index pair.
+    bitonic_sort{greater{}}.template block_sort_indexed<AlignedNumBoxes>(
+        idx,
+        [&](auto i, auto j) { return my_sorted_scores[j] > my_sorted_scores[i]; },
+        [&](auto i, auto j) {
+            swap(my_sorted_scores[i], my_sorted_scores[j]);
+            swap(my_sorted_indices[i], my_sorted_indices[j]);
+            for(index_int k = 0; k < 4; ++k)
+                swap(my_sorted_boxes[array<index_int, 3>{0, i, k}],
+                     my_sorted_boxes[array<index_int, 3>{0, j, k}]);
+        });
 }
 
-// Build the packed upper-triangular IoU mask for the NumBoxes nms_data boxes.
-// Work is striped such that each thread does a multiple of 2 rows so each does roughly the same
-// amount of work regardless of where it falls in the triangle.
-// `nms_data`: nms_data nms_data{} tensor
-// `mask`: bool mask tensor
-template <index_int NumBoxes, class NMSData, class Mask>
-__device__ void
-nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold)
+// Build the packed upper-triangular IoU mask for the first NumBoxes sorted
+// boxes. Threads are paired across the triangle so each does roughly the same
+// amount of work.
+// `sorted_boxes`: per-block 3D view, dims([1, >=NumBoxes, 4])
+// `mask`:         bool mask tensor
+template <index_int NumBoxes, class SortedBoxes, class Mask>
+__device__ void nms_make_iou_mask(const index idx,
+                                  const SortedBoxes sorted_boxes,
+                                  Mask mask,
+                                  const float iou_threshold)
 {
     static_assert(NumBoxes > 0);
     constexpr index_int half = NumBoxes / 2;
+    using box_elem_type      = typename SortedBoxes::type;
+
+    auto load_box = [&](index_int i) {
+        return array<box_elem_type, 4>{sorted_boxes[array<index_int, 3>{0, i, 0}],
+                                       sorted_boxes[array<index_int, 3>{0, i, 1}],
+                                       sorted_boxes[array<index_int, 3>{0, i, 2}],
+                                       sorted_boxes[array<index_int, 3>{0, i, 3}]};
+    };
 
     auto fill_row = [&](index_int i) {
+        const auto box_i = load_box(i);
         for(index_int j = i + 1; j < NumBoxes; ++j)
         {
             mask[nms_packed_idx(i, j, NumBoxes)] =
-                nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold);
+                nms_iou_over_threshold(box_i, load_box(j), iou_threshold);
         }
     };
 
@@ -223,15 +221,18 @@ nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const floa
     }
 }
 
-// Greedy filter that writes selections into a per-batch per-class region of output.
+// Greedy filter that writes selections into a per-batch per-class region of
+// output, reading scores and original-box indices from the sorted_* views.
 template <index_int NumBoxes,
           index_int NumClasses,
-          class NMSData,
+          class SortedScores,
+          class SortedIndices,
           class Mask,
           class Output,
           class Counts>
 __device__ void nms_filter_per_block(const index idx,
-                                     const NMSData nms_data,
+                                     const SortedScores sorted_scores,
+                                     const SortedIndices sorted_indices,
                                      const Mask mask,
                                      const int max_output,
                                      const float score_thr,
@@ -247,7 +248,7 @@ __device__ void nms_filter_per_block(const index idx,
     // Match the ref op: only filter by score when score_threshold > 0.
     const bool do_filter = score_thr > 0.f;
     idx.local_stride(NumBoxes,
-                     [&](auto i) { removed[i] = (do_filter and nms_data[i].score < score_thr); });
+                     [&](auto i) { removed[i] = (do_filter and sorted_scores[i] < score_thr); });
     __syncthreads();
 
     index_int output_idx = 0;
@@ -262,7 +263,7 @@ __device__ void nms_filter_per_block(const index idx,
         {
             if(idx.local == 0)
             {
-                array<typename Output::type, 3> tmp = {batch_idx, class_idx, nms_data[i].box_index};
+                array<typename Output::type, 3> tmp = {batch_idx, class_idx, sorted_indices[i]};
                 auto output_iter = block_output.begin_at(array<index_int, 3>{0, output_idx, 0});
                 copy(tmp.begin(), tmp.end(), output_iter);
             }
@@ -279,10 +280,11 @@ __device__ void nms_filter_per_block(const index idx,
         bc_counts[block_id] = static_cast<int32_t>(output_idx);
 }
 
-// Per-block filter driver: one block per (batch_idx, class_idx).`.
-// Expecting box-coordinate convention has already been normalized into corner form.
 // TODO: Merge the nonmaxsuppression_sort and nonmaxsuppression_filter kernels by relaxing
-// the AlignedNumBoxes resitriction for the sort.
+// the AlignedNumBoxes restriction for the sort.
+// Per-block filter driver: one block per (batch_idx, class_idx). Slices the
+// global sorted_* arrays and passes the views to the IoU-mask and greedy
+// filter helpers. Box coordinates are assumed to already be in corner form.
 template <index_int NumBatches,
           index_int NumClasses,
           index_int NumBoxes,
@@ -320,20 +322,6 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores,
     auto my_sorted_indices =
         slice_tensor(sorted_indices, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
 
-    using scores_type  = typename SortedScores::type;
-    using boxes_type   = typename SortedBoxes::type;
-    using indices_type = typename SortedIndices::type;
-    // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS.
-    // TODO: can add a static_assert on needed LDS size
-    __shared__ uninitialized_buffer<nms_data<scores_type, boxes_type, indices_type>, NumBoxes>
-        block_nms_data;
-
-    idx.local_stride(NumBoxes, [&](auto i) {
-        block_nms_data[i].score = my_sorted_scores[i];
-        auto boxes_iter         = my_sorted_boxes.begin_at(array<index_int, 3>{0, i, 0});
-        copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin());
-        block_nms_data[i].box_index = my_sorted_indices[i];
-    });
     auto my_mask   = slice_tensor(mask, array<index_int, 2>{block_idx, 0}, slice_axes<1>());
     auto my_output = slice_tensor(output, array<index_int, 3>{block_idx, 0, 0}, slice_axes<1, 2>());
 
@@ -342,12 +330,12 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores,
     const float iou_thr_val              = iou_thr_p[0];
     const float score_thr_val            = score_thr_p[0];
 
-    __syncthreads();
-    nms_make_iou_mask<NumBoxes>(idx, block_nms_data, my_mask, iou_thr_val);
+    nms_make_iou_mask<NumBoxes>(idx, my_sorted_boxes, my_mask, iou_thr_val);
 
     __syncthreads();
     nms_filter_per_block<NumBoxes, NumClasses>(idx,
-                                               block_nms_data,
+                                               my_sorted_scores,
+                                               my_sorted_indices,
                                                my_mask,
                                                max_output_boxes_per_class,
                                                score_thr_val,
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index b49d78ca572..491e9348e1e 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -140,12 +140,8 @@ struct bitonic_sort
         });
     }
 
-    // Block-level bitonic sort over a power-of-two buffer in shared or global
-    // memory. All threads in the block cooperate; buf must point to N elements
-    // visible to every thread. The compare_function determines the final order
-    // (e.g. greater{} -> descending). The buffer must be sized to N (a
-    // compile-time power of 2); callers pad with sentinel values when the
-    // logical length is smaller.
+    // Block-wide bitonic sort of an N-element buffer (N must be a power of 2;
+    // pad with sentinels when the logical length is smaller).
     template <index_int N, class T>
     __device__ void block_sort(index idx, T& buf) const
     {
@@ -167,6 +163,46 @@ struct bitonic_sort
             }
         }
     }
+
+    // Block-wide bitonic sort over N slots (N a power of 2) driven only by the caller-supplied
+    // callbacks; compare_function is not consulted. Both are invoked with i < j: compare_at(i, j)
+    // uses the compare_function(buf[j], buf[i]) convention; swap_at(i, j) must swap the pair.
+    template <index_int N, class CompareAt, class SwapAt>
+    __device__ void block_sort_indexed(index idx, CompareAt compare_at, SwapAt swap_at) const
+    {
+        static_assert(is_power_of_2(N), "N must be a power of 2");
+        for(index_int k = 2; k <= N; k <<= 1)
+        {
+            for(index_int j = k >> 1; j > 0; j >>= 1)
+            {
+                idx.local_stride(N, [&](auto tid) {
+                    index_int partner = tid ^ j;
+                    if(partner > tid) // only the lower index of each pair acts
+                    {
+                        const bool reverse = (tid & k) != 0; // bit k set: opposite direction
+                        if(reverse ^ compare_at(tid, partner))
+                            swap_at(tid, partner);
+                    }
+                });
+                __syncthreads(); // complete this stage before the next one starts
+            }
+        }
+    }
+
+    // Key/value bitonic sort: orders N keys under compare_function and applies every swap to
+    // each vals buffer in lockstep. N must be a power of 2 (asserted in block_sort_indexed) and
+    // each buffer must support swap(buf[i], buf[j]); other layouts can call block_sort_indexed.
+    template <index_int N, class KeyBuf, class... ValBufs>
+    __device__ void block_sort(index idx, KeyBuf& keys, ValBufs&... vals) const
+    {
+        block_sort_indexed<N>(
+            idx,
+            [&](auto i, auto j) { return compare_function(keys[j], keys[i]); },
+            [&](auto i, auto j) {
+                swap(keys[i], keys[j]);
+                (swap(vals[i], vals[j]), ...);
+            });
+    }
 };
 
 MIGRAPHX_AUTO_DEDUCE(bitonic_sort);
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index 192e3f8c45d..e685d538973 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -448,16 +448,9 @@ struct miopen_apply
         });
     }
 
-    // Rewrites onnx `nonmaxsuppression` into the GPU op pipeline:
-    //   gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact
-    // Each gpu::nms_* op is wrapped in gpu::precompile_op inline so the JIT
-    // compile pass can pick them up later. We can't rely on the main lowering
-    // loop to wrap them: it walks forward, and the new instructions land
-    // before `ins` so they would never be revisited.
-    //
-    // The kernels are JIT'd against compile-time sizes baked from the input
-    // shapes, so when either of `boxes` / `scores` is dynamic we fall back to
-    // executing the ref op on the host.
+    // Lowers `nonmaxsuppression` to the gpu::nms_sort -> nms_filter ->
+    // nms_compact pipeline, or to a host ref-op fallback when either input
+    // shape is dynamic (the kernels bake compile-time sizes).
     void add_nms_op()
     {
         apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
@@ -469,7 +462,8 @@ struct miopen_apply
         });
     }
 
-    // Static GPU pipeline: gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact.
+    // Static GPU pipeline. Each gpu::nms_* is wrapped in precompile_op inline
+    // because the main lowering loop walks forward and would skip them.
     instruction_ref lower_nms_to_gpu_pipeline(instruction_ref ins) const
     {
         auto inputs            = ins->inputs();
@@ -494,8 +488,7 @@ struct miopen_apply
         bool center_point_box =
             ins->get_operator().to_value().at("center_point_box").to<bool>();
 
-        // Mask is scratch only; allocate up-front so the standard
-        // replace_allocate pass can later turn it into hip::allocate.
+        // Scratch mask; replace_allocate later turns it into hip::allocate.
         shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
         auto mask_alloc = insert_allocation(ins, mask_shape);
 
@@ -530,17 +523,11 @@ struct miopen_apply
         return mod->replace_instruction(ins, compact);
     }
 
-    // Dynamic-shape fallback: run the ref `nonmaxsuppression` op on the host
-    // and copy each tuple element back to its own GPU allocation. Downstream
-    // `get_tuple_elem` consumers of `ins` are rewritten in place to point at
-    // the per-element GPU copies; `ins` itself is left for DCE to remove.
-    //
-    // The ref op produces a tuple {indices, num_selected}, and `hip::copy_to_gpu`
-    // is not tuple-aware (calls `argument::data()` which asserts non-tuple), so
-    // we have to split the tuple on the host side before copying back.
+    // Dynamic-shape fallback: run the ref op on the host. The tuple has to be
+    // split host-side before copy_to_gpu (which is not tuple-aware), and the
+    // downstream get_tuple_elem consumers are rewritten in place.
     instruction_ref lower_nms_to_ref(instruction_ref ins) const
     {
-        // Copy each input from GPU to host, then sync before running the ref op.
         auto inputs = ins->inputs();
         std::vector<instruction_ref> cpu_inputs;
         cpu_inputs.reserve(inputs.size());
@@ -551,11 +538,8 @@ struct miopen_apply
         cpu_inputs.front() =
             mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
 
-        // Ref op produces a tuple {indices [max_num_boxes, 3], num_selected [1]}.
         auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
 
-        // For each sub-shape, extract on the host side and copy back to its
-        // own GPU allocation.
         const auto& sub_shapes = ins->get_shape().sub_shapes();
         std::vector<instruction_ref> gpu_subs;
         gpu_subs.reserve(sub_shapes.size());
@@ -568,7 +552,7 @@ struct miopen_apply
                 ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc));
         }
 
-        // Snapshot outputs since we mutate the graph below.
+        // Snapshot since we mutate the graph below.
         auto consumers = ins->outputs();
         for(auto consumer : consumers)
         {
@@ -582,9 +566,8 @@ struct miopen_apply
             mod->replace_instruction(consumer, gpu_subs[idx]);
         }
 
-        // `ins` is now dead; leave it for dead_code_elimination. Return it so
-        // the apply-loop shape check (which compares against the original
-        // tuple shape) succeeds.
+        // Leave `ins` for dead_code_elimination; return it so the apply-loop
+        // tuple-shape check passes.
         return ins;
     }
 
diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp
index f9ac82c8ebf..de801c2aa32 100644
--- a/src/targets/gpu/nms_ops.cpp
+++ b/src/targets/gpu/nms_ops.cpp
@@ -32,7 +32,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
-// Sort boxes per (batch, class) into nms_data{} tensor.
+// Sort boxes per (batch, class) into per-class sorted_scores / sorted_boxes /
+// sorted_indices tensors.
 // inputs = {boxes, scores}
 struct nms_sort
 {

From 229cf90ef1ec37252ffbcda18c145e6e82b762ab Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 16:42:42 -0500
Subject: [PATCH 27/32] doc comments cleanup

---
 src/include/migraphx/op/nonmaxsuppression.hpp               | 2 +-
 src/onnx/parse_nonmaxsuppression.cpp                        | 4 ++--
 .../kernels/include/migraphx/kernels/nonmaxsuppression.hpp  | 2 +-
 src/targets/gpu/lowering.cpp                                | 1 +
 src/targets/gpu/nms_ops.cpp                                 | 6 +++---
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 87bd541fba1..38f076ca8de 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -308,7 +308,7 @@ struct nonmaxsuppression
         argument result{max_output_shape};
         argument num_selected_result{output_shapes.at(1)};
 
-        std::size_t max_output_boxes_per_class =
+        int64_t max_output_boxes_per_class =
             (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
         if(max_output_boxes_per_class == 0)
         {
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index 959683d01e9..0c343bf970a 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -42,7 +42,7 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
     {
         auto op = parser.load(opd.op_name, info);
         auto nms_ins = info.add_instruction(op, args);
-        // variable ends input slice to handle dynamic shape output
+        // slice with variable ends to handle dynamic shape output.
         auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
         if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{}))
         {
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index ee5b1b090e9..5b02b8136ad 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -392,7 +392,7 @@ nonmaxsuppression_compact(const Counts bc_counts, const Idx indices, Out output,
         const index_int box_idx         = i % NumBoxes;
         if(box_idx < bc_counts[batch_class_idx])
         {
-            for(int k = 0; k < 3; ++k)
+            for(index_int k = 0; k < 3; ++k)
             {
                 output[(offsets[batch_class_idx] + box_idx) * index_size + k] =
                     indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k];
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index e685d538973..dbb16ee5e0a 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -552,6 +552,7 @@ struct miopen_apply
                 ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc));
         }
 
+        // TODO: this needs cleanup
         // Snapshot since we mutate the graph below.
         auto consumers = ins->outputs();
         for(auto consumer : consumers)
diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp
index de801c2aa32..10418002216 100644
--- a/src/targets/gpu/nms_ops.cpp
+++ b/src/targets/gpu/nms_ops.cpp
@@ -58,9 +58,9 @@ struct nms_sort
         const auto num_boxes   = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
         const auto aligned_b =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint32_t>(num_boxes)));
-        shape out_scores_shape{shape::float_type, {num_batches * num_classes, aligned_b}};
-        shape out_boxes_shape{shape::float_type, {num_batches * num_classes, aligned_b, 4}};
+            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        shape out_scores_shape{scores_s.type(), {num_batches * num_classes, aligned_b}};
+        shape out_boxes_shape{boxes.type(), {num_batches * num_classes, aligned_b, 4}};
         shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}};
         return shape{{out_scores_shape, out_boxes_shape, out_box_index_shape}};
     }

From 8bb7865aae1b26e7862274541da78a8288a1eefc Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 17:05:41 -0500
Subject: [PATCH 28/32] Formatting

---
 src/include/migraphx/op/nonmaxsuppression.hpp |   5 +-
 src/onnx/parse_nonmaxsuppression.cpp          |   4 +-
 src/targets/gpu/lowering.cpp                  |  24 ++--
 src/targets/gpu/nms_ops.cpp                   |   9 +-
 test/gpu/nonmaxsuppression.cpp                |   6 +-
 test/onnx/parse/nms_dynamic_batch_test.cpp    |   7 +-
 test/onnx/parse/nms_dynamic_boxes_test.cpp    |   3 +-
 test/onnx/parse/nms_dynamic_classes_test.cpp  |   3 +-
 test/ref/nonmaxsuppression.cpp                | 112 +++++++++---------
 9 files changed, 81 insertions(+), 92 deletions(-)

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
index 38f076ca8de..4b4c47070b3 100644
--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -93,7 +93,7 @@ struct nonmaxsuppression
             }
         };
 
-        if(not (inputs.at(0).dynamic() or inputs.at(1).dynamic()))
+        if(not(inputs.at(0).dynamic() or inputs.at(1).dynamic()))
         {
             fixed_shape_error_check();
         }
@@ -308,8 +308,7 @@ struct nonmaxsuppression
         argument result{max_output_shape};
         argument num_selected_result{output_shapes.at(1)};
 
-        int64_t max_output_boxes_per_class =
-            (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
+        int64_t max_output_boxes_per_class = (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
         if(max_output_boxes_per_class == 0)
         {
             num_selected_result.visit([&](auto output) { output[0] = 0; });
diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index 0c343bf970a..a62e04988b0 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -40,13 +40,13 @@ struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
                           const onnx_parser::node_info& info,
                           const std::vector<instruction_ref>& args) const
     {
-        auto op = parser.load(opd.op_name, info);
+        auto op      = parser.load(opd.op_name, info);
         auto nms_ins = info.add_instruction(op, args);
         // slice with variable ends to handle dynamic shape output.
         auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins);
         if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{}))
         {
-            //TODO: planning to make this the default behavior and removing the env var.
+            // TODO: the plan is to make this the default behavior and remove the env var.
             auto num_selected =
                 info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins);
             auto slice_ins = info.add_instruction(
diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp
index dbb16ee5e0a..6fb4f0cd9c9 100644
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -485,8 +485,7 @@ struct miopen_apply
         if(inputs.size() < 5)
             inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}}));
 
-        bool center_point_box =
-            ins->get_operator().to_value().at("center_point_box").to<bool>();
+        bool center_point_box = ins->get_operator().to_value().at("center_point_box").to<bool>();
 
         // Scratch mask; replace_allocate later turns it into hip::allocate.
         shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}};
@@ -531,12 +530,10 @@ struct miopen_apply
         auto inputs = ins->inputs();
         std::vector<instruction_ref> cpu_inputs;
         cpu_inputs.reserve(inputs.size());
-        std::transform(
-            inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
-                return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
-            });
-        cpu_inputs.front() =
-            mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
+        std::transform(inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
+            return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
+        });
+        cpu_inputs.front() = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
 
         auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
 
@@ -545,11 +542,11 @@ struct miopen_apply
         gpu_subs.reserve(sub_shapes.size());
         for(std::size_t i = 0; i < sub_shapes.size(); ++i)
         {
-            auto cpu_sub = mod->insert_instruction(
-                ins, make_op("get_tuple_elem", {{"index", i}}), cpu_out);
+            auto cpu_sub =
+                mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", i}}), cpu_out);
             auto gpu_alloc = insert_allocation(ins, sub_shapes[i]);
-            gpu_subs.push_back(mod->insert_instruction(
-                ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc));
+            gpu_subs.push_back(
+                mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc));
         }
 
         // TODO: this needs cleanup
@@ -561,8 +558,7 @@ struct miopen_apply
                 MIGRAPHX_THROW("gpu::add_nms_op: dynamic NMS fallback expects only "
                                "get_tuple_elem consumers of nonmaxsuppression; got: " +
                                consumer->name());
-            auto idx =
-                consumer->get_operator().to_value().at("index").to<std::size_t>();
+            auto idx = consumer->get_operator().to_value().at("index").to<std::size_t>();
             assert(idx < gpu_subs.size());
             mod->replace_instruction(consumer, gpu_subs[idx]);
         }
diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp
index 10418002216..682f1f2c003 100644
--- a/src/targets/gpu/nms_ops.cpp
+++ b/src/targets/gpu/nms_ops.cpp
@@ -57,10 +57,9 @@ struct nms_sort
         const auto num_batches = boxes_s.lens()[0];
         const auto num_boxes   = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
-        const auto aligned_b =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        const auto aligned_b = bit_ceil(static_cast<std::uint64_t>(num_boxes));
         shape out_scores_shape{scores_s.type(), {num_batches * num_classes, aligned_b}};
-        shape out_boxes_shape{boxes.type(), {num_batches * num_classes, aligned_b, 4}};
+        shape out_boxes_shape{boxes_s.type(), {num_batches * num_classes, aligned_b, 4}};
         shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}};
         return shape{{out_scores_shape, out_boxes_shape, out_box_index_shape}};
     }
@@ -69,8 +68,8 @@ MIGRAPHX_REGISTER_OP(nms_sort);
 
 // Build the IoU mask and run the greedy filter.
 // Produces a tuple of (raw_output, bc_counts).
-// num_batches/num_classes/num_boxes are kept as op attributes because the filter inputs
-// is a scratch buffer from which these can't be recovered.
+// num_batches/num_classes/num_boxes are kept as op attributes because these can't be recovered
+// from the inputs.
 // inputs = {sorted_boxes, sorted_scores, sorted_box_indices, output_indices, output_bc_counts}
 struct nms_filter
 {
diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp
index 4d9bdb89602..89fdafef7a8 100644
--- a/test/gpu/nonmaxsuppression.cpp
+++ b/test/gpu/nonmaxsuppression.cpp
@@ -198,10 +198,8 @@ TEST_CASE(nms_dynamic_fallback_test)
     using dd = migraphx::shape::dynamic_dimension;
     migraphx::program p;
     auto* mm = p.get_main_module();
-    migraphx::shape boxes_dyn_s{migraphx::shape::float_type,
-                                {dd{1, 1}, dd{4, 10}, dd{4, 4}}};
-    migraphx::shape scores_dyn_s{migraphx::shape::float_type,
-                                 {dd{1, 1}, dd{1, 1}, dd{4, 8}}};
+    migraphx::shape boxes_dyn_s{migraphx::shape::float_type, {dd{1, 1}, dd{4, 10}, dd{4, 4}}};
+    migraphx::shape scores_dyn_s{migraphx::shape::float_type, {dd{1, 1}, dd{1, 1}, dd{4, 8}}};
 
     auto boxes_p         = mm->add_parameter("boxes", boxes_dyn_s);
     auto scores_p        = mm->add_parameter("scores", scores_dyn_s);
diff --git a/test/onnx/parse/nms_dynamic_batch_test.cpp b/test/onnx/parse/nms_dynamic_batch_test.cpp
index bb7e350bea8..346f8d7de5a 100644
--- a/test/onnx/parse/nms_dynamic_batch_test.cpp
+++ b/test/onnx/parse/nms_dynamic_batch_test.cpp
@@ -39,12 +39,7 @@ TEST_CASE(nms_dynamic_batch_test)
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st  = mm->add_parameter("score_threshold", sst);
     auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-        b,
-        s,
-        mo,
-        iou,
-        st);
+        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), b, s, mo, iou, st);
     auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
diff --git a/test/onnx/parse/nms_dynamic_boxes_test.cpp b/test/onnx/parse/nms_dynamic_boxes_test.cpp
index d11552ca3d7..2b43f4200b1 100644
--- a/test/onnx/parse/nms_dynamic_boxes_test.cpp
+++ b/test/onnx/parse/nms_dynamic_boxes_test.cpp
@@ -38,8 +38,7 @@ TEST_CASE(nms_dynamic_boxes_test)
     auto iou = mm->add_parameter("iou_threshold", siou);
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st  = mm->add_parameter("score_threshold", sst);
-    auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st);
     auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
diff --git a/test/onnx/parse/nms_dynamic_classes_test.cpp b/test/onnx/parse/nms_dynamic_classes_test.cpp
index 67a21634568..aaa8f843c1e 100644
--- a/test/onnx/parse/nms_dynamic_classes_test.cpp
+++ b/test/onnx/parse/nms_dynamic_classes_test.cpp
@@ -38,8 +38,7 @@ TEST_CASE(nms_dynamic_classes_test)
     auto iou = mm->add_parameter("iou_threshold", siou);
     migraphx::shape sst{migraphx::shape::float_type, {1}};
     auto st  = mm->add_parameter("score_threshold", sst);
-    auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st);
+    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st);
     auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms);
     mm->add_return({ret});
 
diff --git a/test/ref/nonmaxsuppression.cpp b/test/ref/nonmaxsuppression.cpp
index 8f16ec6dd75..39d95fd56d6 100644
--- a/test/ref/nonmaxsuppression.cpp
+++ b/test/ref/nonmaxsuppression.cpp
@@ -56,12 +56,13 @@ TEST_CASE(nms_dyn_out_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                                   boxes_l,
-                                   scores_l,
-                                   max_out_l,
-                                   iou_threshold,
-                                   score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_l,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -97,7 +98,7 @@ TEST_CASE(nms_identical_all_dyn_out_test)
                                    max_out_l,
                                    iou_threshold,
                                    score_threshold);
-    auto r = add_nms_dynamic_slice(mm, nms);
+    auto r   = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -123,13 +124,13 @@ TEST_CASE(nms_dyn_batch_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-        boxes_p,
-        scores_p,
-        max_out_l,
-        iou_threshold,
-        score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -169,13 +170,13 @@ TEST_CASE(nms_dyn_boxes_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-        boxes_p,
-        scores_p,
-        max_out_l,
-        iou_threshold,
-        score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -212,13 +213,13 @@ TEST_CASE(nms_dyn_classes_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-        boxes_p,
-        scores_p,
-        max_out_l,
-        iou_threshold,
-        score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -266,7 +267,7 @@ TEST_CASE(nms_not_center_test)
                                    max_out_l,
                                    iou_threshold,
                                    score_threshold);
-    auto r = add_nms_dynamic_slice(mm, nms);
+    auto r   = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
     p.compile(migraphx::make_target("ref"));
@@ -294,12 +295,13 @@ TEST_CASE(nms_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                                   boxes_l,
-                                   scores_l,
-                                   max_out_l,
-                                   iou_threshold,
-                                   score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_l,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -332,12 +334,13 @@ TEST_CASE(nms_transpose1_test)
 
     auto transpose_boxes = mm->add_instruction(
         migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l);
-    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                                   transpose_boxes,
-                                   scores_l,
-                                   max_out_l,
-                                   iou_threshold,
-                                   score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            transpose_boxes,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -370,12 +373,13 @@ TEST_CASE(nms_transpose2_test)
 
     auto transpose_boxes = mm->add_instruction(
         migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l);
-    auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-                                   transpose_boxes,
-                                   scores_l,
-                                   max_out_l,
-                                   iou_threshold,
-                                   score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            transpose_boxes,
+                            scores_l,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 
@@ -404,13 +408,13 @@ TEST_CASE(nms_dyn_different_spatial_ranges_test)
     auto iou_threshold   = mm->add_literal(0.5f);
     auto score_threshold = mm->add_literal(0.0f);
 
-    auto nms = mm->add_instruction(
-        migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
-        boxes_p,
-        scores_p,
-        max_out_l,
-        iou_threshold,
-        score_threshold);
+    auto nms =
+        mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
+                            boxes_p,
+                            scores_p,
+                            max_out_l,
+                            iou_threshold,
+                            score_threshold);
     auto r = add_nms_dynamic_slice(mm, nms);
     mm->add_return({r});
 

From 4c27d5fe35485d6f5a01f6f4e63c2fef5421501a Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 17:11:31 -0500
Subject: [PATCH 29/32] Licensing

---
 src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp   | 2 +-
 .../gpu/include/migraphx/gpu/compile_hip_code_object.hpp      | 2 +-
 src/targets/gpu/jit/topk.cpp                                  | 2 +-
 .../kernels/include/migraphx/kernels/nonmaxsuppression.hpp    | 4 ++--
 src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp     | 2 +-
 test/multi_target/multitarget_test.cpp                        | 2 +-
 test/onnx/parse/nms_dynamic_batch_test.cpp                    | 2 +-
 test/onnx/parse/nms_dynamic_boxes_test.cpp                    | 2 +-
 test/onnx/parse/nms_dynamic_classes_test.cpp                  | 2 +-
 test/onnx/parse/nms_test.cpp                                  | 2 +-
 10 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
index 95ce82f224e..22539bb8d6c 100644
--- a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
index f11051916cf..8ef0fbb6533 100644
--- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/jit/topk.cpp b/src/targets/gpu/jit/topk.cpp
index 1deafb2db60..a39a26a4e18 100644
--- a/src/targets/gpu/jit/topk.cpp
+++ b/src/targets/gpu/jit/topk.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 5b02b8136ad..2fd277974d8 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -85,9 +85,9 @@ __device__ inline bool nms_iou_over_threshold(const Box a, const Box b, const Th
 }
 
 // Packed upper-triangular index for j > i within an N x N matrix.
-__device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N)
+__device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int size)
 {
-    return (i * N - (i * (i + 1)) / 2) + j - (i + 1);
+    return (i * size - (i * (i + 1)) / 2) + j - (i + 1);
 }
 
 // One block per (batch_idx, class_idx). Initializes the per-block slice of
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index 491e9348e1e..5403783a601 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/test/multi_target/multitarget_test.cpp b/test/multi_target/multitarget_test.cpp
index 1ca5758e74a..40e4dd37ecf 100644
--- a/test/multi_target/multitarget_test.cpp
+++ b/test/multi_target/multitarget_test.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/test/onnx/parse/nms_dynamic_batch_test.cpp b/test/onnx/parse/nms_dynamic_batch_test.cpp
index 346f8d7de5a..f9ac10fa4aa 100644
--- a/test/onnx/parse/nms_dynamic_batch_test.cpp
+++ b/test/onnx/parse/nms_dynamic_batch_test.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/test/onnx/parse/nms_dynamic_boxes_test.cpp b/test/onnx/parse/nms_dynamic_boxes_test.cpp
index 2b43f4200b1..2b11265d00c 100644
--- a/test/onnx/parse/nms_dynamic_boxes_test.cpp
+++ b/test/onnx/parse/nms_dynamic_boxes_test.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/test/onnx/parse/nms_dynamic_classes_test.cpp b/test/onnx/parse/nms_dynamic_classes_test.cpp
index aaa8f843c1e..8f8a3abd9d2 100644
--- a/test/onnx/parse/nms_dynamic_classes_test.cpp
+++ b/test/onnx/parse/nms_dynamic_classes_test.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/test/onnx/parse/nms_test.cpp b/test/onnx/parse/nms_test.cpp
index f8826a8a96e..3dbf522b504 100644
--- a/test/onnx/parse/nms_test.cpp
+++ b/test/onnx/parse/nms_test.cpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

From b3765f6cd8315b8afe11019600628f3a8808e6e6 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Thu, 21 May 2026 17:12:45 -0500
Subject: [PATCH 30/32] Formatting continued

---
 test/multi_target/multitarget_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/multi_target/multitarget_test.cpp b/test/multi_target/multitarget_test.cpp
index 40e4dd37ecf..d5a196375d3 100644
--- a/test/multi_target/multitarget_test.cpp
+++ b/test/multi_target/multitarget_test.cpp
@@ -216,7 +216,7 @@ TEST_CASE(single_target_multi_compile)
     auto max_out_l                = gpu_mod->add_literal(int64_t{4});
     auto iou_threshold            = gpu_mod->add_literal(0.5f);
     auto score_threshold          = gpu_mod->add_literal(0.0f);
-    auto nms = gpu_mod->add_instruction(
+    auto nms                      = gpu_mod->add_instruction(
         migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}),
         boxes_param_gpu,
         scores_l,
@@ -244,7 +244,7 @@ TEST_CASE(single_target_multi_compile)
     // eval
     migraphx::parameter_map params;
     std::vector<float> boxes_vec  = {0.5, 0.5,  1.0, 1.0, 0.5, 0.6,  1.0, 1.0, 0.5, 0.4,   1.0, 1.0,
-                                    0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
+                                     0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0};
     params["boxes"]               = migraphx::argument(boxes_s, boxes_vec.data());
     auto output                   = p.eval(params).back();
     std::vector<int64_t> gold_vec = {0, 0, 3, 0, 0, 0, 0, 0, 5};

From 0bd8d04bf62f21ad3fee63a0739e10f185dfbe90 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Tue, 26 May 2026 13:11:11 -0500
Subject: [PATCH 31/32] Add changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5f19d84cb1..60c6c5dbf19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,8 @@ Full documentation for MIGraphX is available at
 * Added N-D scale and zero-point support for `QLinearMatMul` operator.
 * Added test cases for `QLinearConv` per-channel scale and `QLinearMatMul` N-D per-channel quantization.
 * Added find_concat_same_input matcher to convert concat(N*x) into multibroadcast(x) to reduce hipCopy() (#4981)
+* Added a GPU kernel for the ONNX `NonMaxSuppression` operator and redesigned the `nonmaxsuppression` operator to better represent its data-dependent output shape in the MIGraphX IR (#4893).
+
 ### Changed
 
 * Converted `nonzero` operator from device implementation to JIT compilation (#4720).
@@ -68,6 +70,7 @@ Full documentation for MIGraphX is available at
 
 ### Removed
 * Removed legacy device implementations for `argmin` and `argmax` in favor of the JIT implementations recently added (#4658).
+* Removed `onnx_options::use_dyn_output` after the redesign of the `NonMaxSuppression` operator (#4893).
 
 ## MIGraphX 2.15 for ROCm 7.2.0
 

From 59b95b7e542644a3dc0300b40d1de8f4fdd82568 Mon Sep 17 00:00:00 2001
From: charlie <charlie.lin@amd.com>
Date: Wed, 27 May 2026 14:27:13 -0500
Subject: [PATCH 32/32] Tidy and formatting

---
 src/onnx/parse_nonmaxsuppression.cpp                        | 1 +
 src/targets/gpu/jit/nonmaxsuppression.cpp                   | 6 ++----
 .../kernels/include/migraphx/kernels/nonmaxsuppression.hpp  | 2 ++
 src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp   | 4 ++++
 src/targets/gpu/nms_ops.cpp                                 | 4 ++--
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp
index a62e04988b0..5549e6a102b 100644
--- a/src/onnx/parse_nonmaxsuppression.cpp
+++ b/src/onnx/parse_nonmaxsuppression.cpp
@@ -24,6 +24,7 @@
 #include <migraphx/onnx/op_parser.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/env.hpp>
 
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS)
 
diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp
index 732e8702410..a1b3109f413 100644
--- a/src/targets/gpu/jit/nonmaxsuppression.cpp
+++ b/src/targets/gpu/jit/nonmaxsuppression.cpp
@@ -150,8 +150,7 @@ struct nms_sort_compiler : compiler<nms_sort_compiler>
         const auto num_batches = boxes_s.lens()[0];
         const auto num_boxes   = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
-        const auto aligned_num_boxes =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        const std::size_t aligned_num_boxes = bit_ceil(num_boxes);
         // NOTE: topK kernel uses relement/4 for amount of work in a block?
         auto block_size = compute_block_size(ctx, aligned_num_boxes, 1024);
 
@@ -192,8 +191,7 @@ struct nms_filter_compiler : compiler<nms_filter_compiler>
         const auto num_batches = v.at("num_batches").to<std::size_t>();
         const auto num_classes = v.at("num_classes").to<std::size_t>();
         const auto num_boxes   = v.at("num_boxes").to<std::size_t>();
-        const auto aligned_num_boxes =
-            static_cast<std::size_t>(bit_ceil(static_cast<std::uint64_t>(num_boxes)));
+        const std::size_t aligned_num_boxes = bit_ceil(num_boxes);
         // TODO: tune for max block size?
         // ceil_div(num_boxes, 2) because of strided thread work distribution
         const auto block_size = compute_block_size(ctx, (num_boxes + 1) / 2, 256);
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
index 2fd277974d8..2f104a4676b 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp
@@ -165,6 +165,7 @@ __device__ void nonmaxsuppression_sort(const Boxes boxes_tv,
 
     // Sort scores descending, dragging boxes and indices along. Uses the
     // indexed variant so we can swap all 4 box lanes per index pair.
+    // NOLINTNEXTLINE(clang-diagnostic-error)
     bitonic_sort{greater{}}.template block_sort_indexed<AlignedNumBoxes>(
         idx,
         [&](auto i, auto j) { return my_sorted_scores[j] > my_sorted_scores[i]; },
@@ -214,6 +215,7 @@ __device__ void nms_make_iou_mask(const index idx,
     });
 
     // Have thread 0 do middle row if odd NumBoxes
+    // NOLINTNEXTLINE(hicpp-signed-bitwise)
     if constexpr((NumBoxes & 1) != 0 and NumBoxes > 1)
     {
         if(idx.local == 0)
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
index 5403783a601..7c603eeb9b8 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp
@@ -146,8 +146,10 @@ struct bitonic_sort
     __device__ void block_sort(index idx, T& buf) const
     {
         static_assert(is_power_of_2(N), "N must be a power of 2");
+        // NOLINTNEXTLINE(hicpp-signed-bitwise)
         for(index_int k = 2; k <= N; k <<= 1)
         {
+            // NOLINTNEXTLINE(hicpp-signed-bitwise)
             for(index_int j = k >> 1; j > 0; j >>= 1)
             {
                 idx.local_stride(N, [&](auto tid) {
@@ -171,8 +173,10 @@ struct bitonic_sort
     __device__ void block_sort_indexed(index idx, CompareAt compare_at, SwapAt swap_at) const
     {
         static_assert(is_power_of_2(N), "N must be a power of 2");
+        // NOLINTNEXTLINE(hicpp-signed-bitwise)
         for(index_int k = 2; k <= N; k <<= 1)
         {
+            // NOLINTNEXTLINE(hicpp-signed-bitwise)
             for(index_int j = k >> 1; j > 0; j >>= 1)
             {
                 idx.local_stride(N, [&](auto tid) {
diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp
index 682f1f2c003..76778dc916c 100644
--- a/src/targets/gpu/nms_ops.cpp
+++ b/src/targets/gpu/nms_ops.cpp
@@ -57,7 +57,7 @@ struct nms_sort
         const auto num_batches = boxes_s.lens()[0];
         const auto num_boxes   = boxes_s.lens()[1];
         const auto num_classes = scores_s.lens()[1];
-        const auto aligned_b = bit_ceil(static_cast<std::uint64_t>(num_boxes));
+        const auto aligned_b   = bit_ceil(static_cast<std::uint64_t>(num_boxes));
         shape out_scores_shape{scores_s.type(), {num_batches * num_classes, aligned_b}};
         shape out_boxes_shape{boxes_s.type(), {num_batches * num_classes, aligned_b, 4}};
         shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}};
@@ -98,7 +98,7 @@ struct nms_filter
 MIGRAPHX_REGISTER_OP(nms_filter);
 
 // TODO: This should work in-place, saving memory. Need to update IR to handle it.
-//  Needs a make_tuple type of operator that reuses the indicies input.
+//  Needs a make_tuple type of operator that reuses the indices input.
 // Prefix-scan the per-block counts and compact the selections into
 // the final selected_indices. Output as selected_indices and num_selected tuple.
 // inputs = {output_bc_counts, output_indices}