From 8e3f22eaedcbed7c46ff9ae33a61e9ad94b157d5 Mon Sep 17 00:00:00 2001 From: charlie Date: Fri, 8 May 2026 16:53:51 -0500 Subject: [PATCH 01/32] Initial AI implementation from prototype --- src/targets/gpu/jit/nonmaxsuppression.cpp | 215 ++++++++++++ .../migraphx/kernels/nonmaxsuppression.hpp | 305 ++++++++++++++++++ .../kernels/include/migraphx/kernels/sort.hpp | 28 ++ src/targets/gpu/lowering.cpp | 8 + test/verify/test_nms.cpp | 113 ++++++- 5 files changed, 668 insertions(+), 1 deletion(-) create mode 100644 src/targets/gpu/jit/nonmaxsuppression.cpp create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp new file mode 100644 index 00000000000..1b6ba4bb62c --- /dev/null +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -0,0 +1,215 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// nms_data is laid out as { float score; float box[4]; int box_index; } for a +// total of 24 bytes per entry. The scratch workspace is allocated as raw int8 +// and reinterpreted in the kernel. +static constexpr std::size_t nms_bytes_per_data = 24; + +// NOLINTNEXTLINE +static const char* const nms_kernel_src = R"__migraphx__( +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void nms_kernel(${params}) +{ + make_tensors()(${args})([](auto boxes, + auto scores, + auto max_p, + auto iou_p, + auto thr_p, + auto sorted, + auto mask, + auto count, + auto out) { + nonmaxsuppression<${center_point_box}, + ${num_batches}, + ${num_classes}, + ${num_boxes}, + ${aligned_num_boxes}>( + boxes, scores, max_p, iou_p, thr_p, sorted, mask, count, out); + }); +} + +} + +} // namespace migraphx +)__migraphx__"; + +struct nms_compiler : compiler +{ + std::vector names() const { return {"nonmaxsuppression"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + // inputs (in order): boxes, scores, max, iou, score_thr, + // sorted_data, iou_mask, global_count, output. 
+ const auto& boxes_s = inputs[0]; + const auto& scores_s = inputs[1]; + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto aligned_b = static_cast(bit_ceil(static_cast(b))); + const auto block_size = std::min(aligned_b, std::size_t{1024}); + + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = "nms_kernel"; + options.virtual_inputs = inputs; + options.set_launch_params(v, block_size * nb * nc, block_size); + + auto src = interpolate_string( + nms_kernel_src, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"num_batches", std::to_string(nb)}, + {"num_classes", std::to_string(nc)}, + {"num_boxes", std::to_string(b)}, + {"aligned_num_boxes", std::to_string(aligned_b)}, + {"center_point_box", + v.at("center_point_box").to() ? "true" : "false"}}); + return compile_hip_code_object(ctx, src, options); + } + + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + // ins->inputs() is [user_inputs..., output_alloc] from + // insert_precompile_op. user_inputs has 2..5 entries per ONNX NMS. + auto raw = ins->inputs(); + if(raw.size() < 3 or raw.size() > 6) + MIGRAPHX_THROW("nms_compiler: unexpected input count " + std::to_string(raw.size())); + + std::vector raw_shapes; + raw_shapes.reserve(raw.size() - 1); + std::transform(raw.begin(), + raw.end() - 1, + std::back_inserter(raw_shapes), + [](auto i) { return i->get_shape(); }); + + // Default shapes for missing optional scalar inputs. The literals + // inserted by the replace lambda use these same shapes so the + // compiled kernel's tensor_view types match the runtime arguments. 
+ const shape default_max_s{shape::int64_type, {1}}; + const shape default_iou_s{shape::float_type, {1}}; + const shape default_thr_s{shape::float_type, {1}}; + if(raw_shapes.size() < 3) + raw_shapes.push_back(default_max_s); + if(raw_shapes.size() < 4) + raw_shapes.push_back(default_iou_s); + if(raw_shapes.size() < 5) + raw_shapes.push_back(default_thr_s); + + const auto& boxes_s = raw_shapes[0]; + const auto& scores_s = raw_shapes[1]; + if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) + MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D"); + + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto aligned_b = static_cast(bit_ceil(static_cast(b))); + const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1}; + + shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; + shape mask_s{shape::uint8_type, {nb * nc * iou_packed}}; + shape count_s{shape::int64_type, {1}}; + + std::vector kshapes = raw_shapes; + kshapes.push_back(sorted_s); + kshapes.push_back(mask_s); + kshapes.push_back(count_s); + kshapes.push_back(raw.back()->get_shape()); + + auto kop = compile_op(ctx, kshapes, op.to_value()); + + return {kop, [=](module& m, instruction_ref ins2, const operation& cop) { + auto args = ins2->inputs(); + auto out = args.back(); + args.pop_back(); + + if(args.size() < 3) + { + args.push_back(m.insert_literal( + ins2, literal{default_max_s, {std::int64_t{0}}})); + } + if(args.size() < 4) + { + args.push_back( + m.insert_literal(ins2, literal{default_iou_s, {0.0f}})); + } + if(args.size() < 5) + { + args.push_back( + m.insert_literal(ins2, literal{default_thr_s, {0.0f}})); + } + + auto sorted = m.insert_instruction( + ins2, make_op("hip::allocate", {{"shape", to_value(sorted_s)}})); + auto mask = m.insert_instruction( + ins2, make_op("hip::allocate", {{"shape", to_value(mask_s)}})); + auto count = m.insert_instruction( + ins2, make_op("hip::allocate", 
{{"shape", to_value(count_s)}})); + + // Reset the global atomic counter to zero each launch and + // pre-zero the output buffer so unwritten rows match the + // CPU implementation's behavior. + count = m.insert_instruction( + ins2, make_op("hip::fill", {{"value", 0}}), count); + out = m.insert_instruction( + ins2, make_op("hip::fill", {{"value", 0}}), out); + + args.push_back(sorted); + args.push_back(mask); + args.push_back(count); + args.push_back(out); + + m.replace_instruction(ins2, cop, args); + }}; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp new file mode 100644 index 00000000000..f9f4a88a69a --- /dev/null +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -0,0 +1,305 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP +#define MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +// Per-box record carried through the sort. Box corners are stored normalized +// to (xmin, ymin, xmax, ymax) so the IoU computation is independent of the +// center_point_box attribute. +struct nms_data +{ + float score; + array box; + int box_index; +}; + +// Decode a single box into (xmin, ymin, xmax, ymax) corners. +template +__device__ inline array nms_normalize_box(const float* b) +{ + if constexpr(CenterPointBox) + { + const float xc = b[0]; + const float yc = b[1]; + const float hw = b[2] * 0.5f; + const float hh = b[3] * 0.5f; + return {xc - hw, yc - hh, xc + hw, yc + hh}; + } + else + { + // ONNX layout: [y1, x1, y2, x2]; corners may be in either order. 
+ const float y1 = b[0]; + const float x1 = b[1]; + const float y2 = b[2]; + const float x2 = b[3]; + const float xmin = min(x1, x2); + const float xmax = max(x1, x2); + const float ymin = min(y1, y2); + const float ymax = max(y1, y2); + return {xmin, ymin, xmax, ymax}; + } +} + +__device__ inline bool +nms_iou_over_threshold(const array& a, const array& b, float threshold) +{ + const float left = max(a[0], b[0]); + const float right = min(a[2], b[2]); + const float top = max(a[1], b[1]); + const float bottom = min(a[3], b[3]); + const float w = max(right - left, 0.f); + const float h = max(bottom - top, 0.f); + const float inter = w * h; + const float area_a = max(a[2] - a[0], 0.f) * max(a[3] - a[1], 0.f); + const float area_b = max(b[2] - b[0], 0.f) * max(b[3] - b[1], 0.f); + const float un = area_a + area_b - inter; + if(area_a <= 0.f or area_b <= 0.f or un <= 0.f) + return false; + return (inter / un) > threshold; +} + +// Packed upper-triangular index for j > i within an N x N matrix. +__device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N) +{ + return (i * N - (i * (i + 1)) / 2) + j - (i + 1); +} + +struct nms_score_greater +{ + constexpr bool operator()(const nms_data& a, const nms_data& b) const + { + return a.score > b.score; + } +}; + +// Phase 1: load (score, box, box_index) tuples into a per-block buffer of +// AlignedN entries (power of two), padding the [N, AlignedN) tail with sentinel +// values, then sort the buffer in descending order by score. +template +__device__ void nms_load_and_sort(index idx, + const float* boxes_b, // [N, 4] + const float* scores_bc, // [N] + nms_data* sorted) +{ + idx.local_stride(AlignedN, [&](auto i) { + nms_data d; + if(i < N) + { + d.score = scores_bc[i]; + d.box = nms_normalize_box(boxes_b + i * 4); + d.box_index = static_cast(i); + } + else + { + // Sentinel: -inf score so it never beats any real entry, and a + // negative box_index so accidental dereferencing is detectable. 
+ d.score = -__FLT_MAX__; + d.box = array{0.f, 0.f, 0.f, 0.f}; + d.box_index = -1; + } + sorted[i] = d; + }); + __syncthreads(); + bitonic_sort{nms_score_greater{}}.template block_sort(idx, sorted); +} + +// Phase 2: build the packed upper-triangular IoU mask for the N sorted boxes. +// Work is striped (i, N-1-i) per thread so each thread does roughly the same +// amount of work regardless of where it falls in the triangle. +template +__device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* mask, float iou_thr) +{ + constexpr index_int half = N / 2; + + auto fill_row = [&](index_int i) { + for(index_int j = i + 1; j < N; ++j) + { + mask[nms_packed_idx(i, j, N)] = + nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_thr) ? 1 : 0; + } + }; + + idx.local_stride(half, [&](auto i) { + fill_row(i); + fill_row(N - 1 - i); + }); + + if constexpr((N & 1) != 0 and N > 1) + { + if(idx.local == 0) + fill_row(half); + } +} + +// Phase 3: greedy filter, mirroring the prototype but using a global atomic +// counter to compact outputs from all (batch, class) blocks into a single +// dense output buffer. +template +__device__ void nms_filter_atomic(index idx, + const nms_data* sorted, + const uint8_t* mask, + int batch_idx, + int class_idx, + index_int max_output, + float score_thr, + unsigned long long* global_count, // NOLINT + int64_t* output, + index_int output_capacity) +{ + __shared__ uint8_t removed[N > 0 ? N : 1]; + // Match the CPU op: only filter by score when score_threshold > 0 (the CPU + // takes the same branch). With a non-positive (or sentinel) threshold, all + // boxes are kept regardless of sign. + const bool do_filter = score_thr > 0.f; + idx.local_stride(N, [&](auto i) { + removed[i] = (do_filter and sorted[i].score < score_thr) ? 
1 : 0; + }); + __syncthreads(); + + index_int output_idx = 0; + for(index_int i = 0; i < N; ++i) + { + if(output_idx >= max_output) + { + __syncthreads(); + break; + } + if(removed[i] == 0) + { + if(idx.local == 0) + { + const unsigned long long slot = atomicAdd(global_count, 1ull); // NOLINT + if(slot < static_cast(output_capacity)) + { + output[slot * 3 + 0] = batch_idx; + output[slot * 3 + 1] = class_idx; + output[slot * 3 + 2] = sorted[i].box_index; + } + } + ++output_idx; + // Update removed[] using row i of the IoU mask. Each thread handles + // a stride of the row to balance work. + for(index_int j = i + 1 + idx.local; j < N; j += idx.nlocal()) + { + removed[j] |= mask[nms_packed_idx(i, j, N)]; + } + } + __syncthreads(); + } +} + +// Per-block driver: one block per (batch_idx, class_idx). Workspace pointers +// are sliced into per-block segments using idx.group. +template +__device__ void nonmaxsuppression(Boxes boxes, + Scores scores, + MaxOut max_out_p, + IouThr iou_thr_p, + ScoreThr score_thr_p, + Sorted sorted_buf, + Mask mask_buf, + Count count_buf, + Output output) +{ + static_assert(NumBatches > 0, "num_batches must be > 0"); + static_assert(NumClasses > 0, "num_classes must be > 0"); + + auto idx = make_index(); + const index_int block_id = idx.group; + const int batch_idx = static_cast(block_id / NumClasses); + const int class_idx = static_cast(block_id % NumClasses); + constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1; + + nms_data* my_sorted = + reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; + uint8_t* my_mask = reinterpret_cast(mask_buf.data()) + block_id * iou_packed_size; + + const float* boxes_b = boxes.data() + batch_idx * NumBoxes * 4; + const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; + + // Pull scalar tensor inputs once. They're broadcast to all threads via the + // common load (each thread reads the same single element). 
+ const int64_t max_out_val = max_out_p[0]; + const float iou_thr_val = iou_thr_p[0]; + const float score_thr_val = score_thr_p[0]; + + nms_load_and_sort( + idx, boxes_b, scores_bc, my_sorted); + __syncthreads(); + + if constexpr(NumBoxes > 1) + { + nms_make_iou_mask(idx, my_sorted, my_mask, iou_thr_val); + __syncthreads(); + } + + // The CPU op reads max_output_boxes_per_class as std::size_t, so a negative + // signed value is treated as a very large unsigned (effectively unlimited). + // Mirror that here by reinterpreting as unsigned and then capping at + // NumBoxes, which is the most we could ever emit per (batch, class) block. + const auto max_unsigned = static_cast(max_out_val); + const index_int max_output = (max_unsigned > static_cast(NumBoxes)) + ? static_cast(NumBoxes) + : static_cast(max_unsigned); + const index_int output_capacity = output.get_shape().lens[0]; + auto* count_addr = + reinterpret_cast(count_buf.data()); // NOLINT + nms_filter_atomic(idx, + my_sorted, + my_mask, + batch_idx, + class_idx, + max_output, + score_thr_val, + count_addr, + output.data(), + output_capacity); +} + +} // namespace migraphx + +#endif // MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index 1e3cc019558..fa4d1c981e2 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -139,6 +139,34 @@ struct bitonic_sort lane_merge(get_bit(id, w), x); }); } + + // Block-level bitonic sort over a power-of-two buffer in shared or global + // memory. All threads in the block cooperate; buf must point to N elements + // visible to every thread. The compare_function determines the final order + // (e.g. greater{} -> descending). The buffer must be sized to N (a + // compile-time power of 2); callers pad with sentinel values when the + // logical length is smaller. 
+ template + __device__ void block_sort(index idx, T* buf) const + { + static_assert(is_power_of_2(N), "N must be a power of 2"); + for(index_int k = 2; k <= N; k <<= 1) + { + for(index_int j = k >> 1; j > 0; j >>= 1) + { + idx.local_stride(N, [&](auto tid) { + index_int partner = tid ^ j; + if(partner > tid) + { + const bool reverse = (tid & k) != 0; + if(this->compare(buf[tid], buf[partner], reverse)) + swap(buf[tid], buf[partner]); + } + }); + __syncthreads(); + } + } + } }; MIGRAPHX_AUTO_DEDUCE(bitonic_sort); diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 6b6def4721d..108d994057a 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -449,6 +449,14 @@ struct miopen_apply void add_nms_op() { apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { + // Fixed-output NMS is handled by the JIT kernel registered via + // jit/nonmaxsuppression.cpp; route it through insert_precompile_op + // so compile_ops picks it up later. The dynamic-output mode still + // falls back to the CPU implementation. + auto op_val = ins->get_operator().to_value(); + if(not op_val.at("use_dyn_output").to()) + return insert_precompile_op(ins); + auto s = ins->get_shape(); auto output = insert_allocation(ins, s); std::vector cpu_inputs; diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp index 6b3e56bafd6..98702a12b5e 100644 --- a/test/verify/test_nms.cpp +++ b/test/verify/test_nms.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -57,3 +57,114 @@ struct test_nms : verify_program return p; } }; + +// Multi-batch fixed-output NMS exercises the (batch_idx, class_idx) -> block_id +// dispatch in the GPU kernel. +struct test_nms_multi_batch : verify_program +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + + migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}}; + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, + 1.0, 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, + 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, + 0.4, 1.0, 1.0, 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, + 0.5, 100.5, 1.0, 1.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}}; + std::vector scores_vec = { + 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto r = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + mm->add_return({r}); + + return p; + } +}; + +// Multi-class fixed-output NMS exercises per-class greedy filtering with +// outputs interleaved by the global atomic counter. 
+struct test_nms_multi_class : verify_program +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, + -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, + 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}}; + std::vector scores_vec = { + 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{2}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto r = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + mm->add_return({r}); + + return p; + } +}; + +// center_point_box=0 path with potentially flipped corner coordinates. 
+struct test_nms_not_center : verify_program +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto r = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 0}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + mm->add_return({r}); + + return p; + } +}; From 4ec2fe13103402fbca7f13b687eab13b809821b2 Mon Sep 17 00:00:00 2001 From: charlie Date: Tue, 12 May 2026 16:00:09 -0500 Subject: [PATCH 02/32] AI edit with "compact" kernel --- src/targets/gpu/jit/nonmaxsuppression.cpp | 160 ++++++++++++++---- .../migraphx/kernels/nonmaxsuppression.hpp | 135 ++++++++++----- test/verify/test_nms.cpp | 35 +--- 3 files changed, 230 insertions(+), 100 deletions(-) diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index 1b6ba4bb62c..ff7dd14e289 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -43,8 +43,12 @@ namespace gpu { // and reinterpreted in the kernel. static constexpr std::size_t nms_bytes_per_data = 24; +// Phase-1 ("compute") kernel: each block runs NMS for its (batch, class) and +// writes selections into a per-block region of the raw_output scratch plus a +// per-block count. 
No global atomic counter is used, so per-block contents +// are deterministic. // NOLINTNEXTLINE -static const char* const nms_kernel_src = R"__migraphx__( +static const char* const nms_compute_kernel_src = R"__migraphx__( #include #include @@ -61,14 +65,40 @@ MIGRAPHX_GLOBAL void nms_kernel(${params}) auto thr_p, auto sorted, auto mask, - auto count, - auto out) { + auto counts, + auto raw_out) { nonmaxsuppression<${center_point_box}, ${num_batches}, ${num_classes}, ${num_boxes}, ${aligned_num_boxes}>( - boxes, scores, max_p, iou_p, thr_p, sorted, mask, count, out); + boxes, scores, max_p, iou_p, thr_p, sorted, mask, counts, raw_out); + }); +} + +} + +} // namespace migraphx +)__migraphx__"; + +// Phase-2 ("compact") kernel: a single thread walks the per-block raw_output +// regions in block_id order and copies the first counts[b] selections from +// each region into a contiguous prefix of the final output. The order of +// (block_id 0, 1, ...) is the same as the CPU op's (batch, class) iteration +// order, so the resulting output matches the CPU op exactly. +// NOLINTNEXTLINE +static const char* const nms_compact_kernel_src = R"__migraphx__( +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) +{ + make_tensors()(${args})([](auto counts, auto raw_out, auto out) { + nonmaxsuppression_compact<${num_blocks}, ${num_boxes}>(counts, raw_out, out); }); } @@ -81,10 +111,13 @@ struct nms_compiler : compiler { std::vector names() const { return {"nonmaxsuppression"}; } - operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + // Compile the per-block compute kernel. `inputs` is: + // [boxes, scores, max, iou, score_thr, sorted, mask, counts, raw_output] + // `raw_output` is the last input so the framework treats it as the + // kernel's output buffer; the per-block counts is an in/out scratch. 
+ operation + compile_compute(context& ctx, const std::vector& inputs, const value& v) const { - // inputs (in order): boxes, scores, max, iou, score_thr, - // sorted_data, iou_mask, global_count, output. const auto& boxes_s = inputs[0]; const auto& scores_s = inputs[1]; const auto nb = boxes_s.lens()[0]; @@ -95,13 +128,13 @@ struct nms_compiler : compiler hip_compile_options options; options.inputs = inputs; - options.output = inputs.back(); + options.output = inputs.back(); // raw_output buffer options.kernel_name = "nms_kernel"; options.virtual_inputs = inputs; options.set_launch_params(v, block_size * nb * nc, block_size); auto src = interpolate_string( - nms_kernel_src, + nms_compute_kernel_src, {{"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, {"num_batches", std::to_string(nb)}, @@ -113,6 +146,47 @@ struct nms_compiler : compiler return compile_hip_code_object(ctx, src, options); } + // Compile the serial compaction kernel. `inputs` is: + // [counts, raw_output, output] + // Launched with one thread (single block, single thread) since the work + // is intentionally serial: it walks per-block regions in fixed order to + // produce the exact byte-for-byte output the CPU op produces. + operation + compile_compact(context& ctx, const std::vector& inputs, const value& v) const + { + // Derive num_blocks (length of counts) and per-block stride NumBoxes + // (raw_output is sized nb*nc*NumBoxes*3 int64 entries). + const auto& cnt_s = inputs[0]; + const auto& raw_s = inputs[1]; + const auto num_blocks = cnt_s.elements(); + const auto num_boxes = (num_blocks > 0) + ? 
raw_s.elements() / (num_blocks * std::size_t{3}) + : std::size_t{0}; + + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); + options.kernel_name = "nms_compact_kernel"; + options.virtual_inputs = inputs; + options.set_launch_params(v, std::size_t{1}, std::size_t{1}); + + auto src = interpolate_string( + nms_compact_kernel_src, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"num_blocks", std::to_string(num_blocks)}, + {"num_boxes", std::to_string(num_boxes)}}); + return compile_hip_code_object(ctx, src, options); + } + + // Required compiler<> hook: return the compute kernel based on the raw + // input shapes. The full two-kernel chain is handled in `compile()`; this + // entry point is only used by callers that ask for a single op view. + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + return compile_compute(ctx, inputs, v); + } + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const { // ins->inputs() is [user_inputs..., output_alloc] from @@ -154,17 +228,32 @@ struct nms_compiler : compiler shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; shape mask_s{shape::uint8_type, {nb * nc * iou_packed}}; - shape count_s{shape::int64_type, {1}}; + // Per-block raw output: nb*nc blocks, each can write up to b + // selections of (batch, class, box_idx) int64 triples. + shape raw_output_s{shape::int64_type, {nb * nc * b * 3}}; + // Per-block selection counts (one int32 per (batch, class) block). 
+ shape counts_s{shape::int32_type, {nb * nc}}; + + // Compute kernel input shapes: [user inputs..., sorted, mask, counts, raw_out] + std::vector compute_shapes = raw_shapes; + compute_shapes.push_back(sorted_s); + compute_shapes.push_back(mask_s); + compute_shapes.push_back(counts_s); + compute_shapes.push_back(raw_output_s); + + // Compact kernel input shapes: [counts, raw_out, output] + std::vector compact_shapes; + compact_shapes.push_back(counts_s); + compact_shapes.push_back(raw_output_s); + compact_shapes.push_back(raw.back()->get_shape()); - std::vector kshapes = raw_shapes; - kshapes.push_back(sorted_s); - kshapes.push_back(mask_s); - kshapes.push_back(count_s); - kshapes.push_back(raw.back()->get_shape()); + auto compute_kop = compile_compute(ctx, compute_shapes, op.to_value()); + auto compact_kop = compile_compact(ctx, compact_shapes, op.to_value()); - auto kop = compile_op(ctx, kshapes, op.to_value()); + std::vector kops = {compute_kop, compact_kop}; - return {kop, [=](module& m, instruction_ref ins2, const operation& cop) { + return {kops, + [=](module& m, instruction_ref ins2, const std::vector& cops) { auto args = ins2->inputs(); auto out = args.back(); args.pop_back(); @@ -189,23 +278,34 @@ struct nms_compiler : compiler ins2, make_op("hip::allocate", {{"shape", to_value(sorted_s)}})); auto mask = m.insert_instruction( ins2, make_op("hip::allocate", {{"shape", to_value(mask_s)}})); - auto count = m.insert_instruction( - ins2, make_op("hip::allocate", {{"shape", to_value(count_s)}})); - - // Reset the global atomic counter to zero each launch and - // pre-zero the output buffer so unwritten rows match the - // CPU implementation's behavior. 
- count = m.insert_instruction( - ins2, make_op("hip::fill", {{"value", 0}}), count); + auto raw_out = m.insert_instruction( + ins2, make_op("hip::allocate", {{"shape", to_value(raw_output_s)}})); + auto counts = m.insert_instruction( + ins2, make_op("hip::allocate", {{"shape", to_value(counts_s)}})); + + // Pre-zero the final output buffer so unwritten rows match + // the CPU implementation's behavior (trailing zeros). The + // counts and raw_out scratch don't need zeroing: each + // block writes its count exactly once and the compact + // kernel only reads counts[b] entries from each block. out = m.insert_instruction( ins2, make_op("hip::fill", {{"value", 0}}), out); - args.push_back(sorted); - args.push_back(mask); - args.push_back(count); - args.push_back(out); + auto compute_args = args; + compute_args.push_back(sorted); + compute_args.push_back(mask); + compute_args.push_back(counts); + compute_args.push_back(raw_out); + + auto compute_ins = + m.insert_instruction(ins2, cops[0], compute_args); - m.replace_instruction(ins2, cop, args); + // Use compute_ins (returned raw_out) as the dataflow edge + // so the compact kernel is ordered after the compute + // kernel and the raw_out buffer remains live. + std::vector compact_args = { + counts, compute_ins, out}; + m.replace_instruction(ins2, cops[1], compact_args); }}; } }; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index f9f4a88a69a..ead79aed578 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -162,20 +162,20 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma } } -// Phase 3: greedy filter, mirroring the prototype but using a global atomic -// counter to compact outputs from all (batch, class) blocks into a single -// dense output buffer. 
+// Phase 3: greedy filter that writes selections into a per-block region of a +// scratch buffer (block_id * N entries) and stores the per-block count. A +// follow-up serial compaction kernel gathers per-block regions in block_id +// order to produce a deterministic compacted output that matches the CPU op. template -__device__ void nms_filter_atomic(index idx, - const nms_data* sorted, - const uint8_t* mask, - int batch_idx, - int class_idx, - index_int max_output, - float score_thr, - unsigned long long* global_count, // NOLINT - int64_t* output, - index_int output_capacity) +__device__ void nms_filter_per_block(index idx, + const nms_data* sorted, + const uint8_t* mask, + int batch_idx, + int class_idx, + index_int max_output, + float score_thr, + int64_t* raw_output, // [num_blocks * N * 3] + int32_t* block_counts) // [num_blocks] { __shared__ uint8_t removed[N > 0 ? N : 1]; // Match the CPU op: only filter by score when score_threshold > 0 (the CPU @@ -187,6 +187,9 @@ __device__ void nms_filter_atomic(index idx, }); __syncthreads(); + const index_int block_id = idx.group; + int64_t* my_output = raw_output + block_id * N * 3; + index_int output_idx = 0; for(index_int i = 0; i < N; ++i) { @@ -199,13 +202,9 @@ __device__ void nms_filter_atomic(index idx, { if(idx.local == 0) { - const unsigned long long slot = atomicAdd(global_count, 1ull); // NOLINT - if(slot < static_cast(output_capacity)) - { - output[slot * 3 + 0] = batch_idx; - output[slot * 3 + 1] = class_idx; - output[slot * 3 + 2] = sorted[i].box_index; - } + my_output[output_idx * 3 + 0] = batch_idx; + my_output[output_idx * 3 + 1] = class_idx; + my_output[output_idx * 3 + 2] = sorted[i].box_index; } ++output_idx; // Update removed[] using row i of the IoU mask. 
Each thread handles @@ -217,10 +216,47 @@ __device__ void nms_filter_atomic(index idx, } __syncthreads(); } + + if(idx.local == 0) + block_counts[block_id] = static_cast(output_idx); +} + +// Serial compaction: a single thread walks per-block regions in block_id order +// (which equals the CPU op's (batch, class) iteration order) and copies the +// first block_counts[b] entries of each region into a contiguous prefix of the +// final output buffer. Trailing slots are left as the zero fill applied before +// this kernel runs. +template +__device__ void +nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output) +{ + if(idx.global == 0) + { + index_int dst = 0; + for(index_int b = 0; b < NumBlocks; ++b) + { + const int32_t cnt = block_counts[b]; + const int64_t* src = raw_output + b * NumBoxes * 3; + for(int32_t i = 0; i < cnt; ++i) + { + output[dst * 3 + 0] = src[i * 3 + 0]; + output[dst * 3 + 1] = src[i * 3 + 1]; + output[dst * 3 + 2] = src[i * 3 + 2]; + ++dst; + } + } + } } // Per-block driver: one block per (batch_idx, class_idx). Workspace pointers -// are sliced into per-block segments using idx.group. +// are sliced into per-block segments using idx.group. Selections are written +// to a per-block region of `raw_output` and the per-block count is written to +// `block_counts`; a follow-up compact kernel produces the final compacted +// output that matches the CPU op's ordering. +// +// `raw_output_buf` is intentionally the last parameter so that JIT-compiled +// callers (which use `inputs.back()` as the kernel's output buffer) treat it +// as the chained output flowing into the compact kernel. 
template + class Counts, + class RawOutput> __device__ void nonmaxsuppression(Boxes boxes, Scores scores, MaxOut max_out_p, @@ -242,8 +278,8 @@ __device__ void nonmaxsuppression(Boxes boxes, ScoreThr score_thr_p, Sorted sorted_buf, Mask mask_buf, - Count count_buf, - Output output) + Counts counts_buf, + RawOutput raw_output_buf) { static_assert(NumBatches > 0, "num_batches must be > 0"); static_assert(NumClasses > 0, "num_classes must be > 0"); @@ -281,23 +317,38 @@ __device__ void nonmaxsuppression(Boxes boxes, // signed value is treated as a very large unsigned (effectively unlimited). // Mirror that here by reinterpreting as unsigned and then capping at // NumBoxes, which is the most we could ever emit per (batch, class) block. - const auto max_unsigned = static_cast(max_out_val); - const index_int max_output = (max_unsigned > static_cast(NumBoxes)) - ? static_cast(NumBoxes) - : static_cast(max_unsigned); - const index_int output_capacity = output.get_shape().lens[0]; - auto* count_addr = - reinterpret_cast(count_buf.data()); // NOLINT - nms_filter_atomic(idx, - my_sorted, - my_mask, - batch_idx, - class_idx, - max_output, - score_thr_val, - count_addr, - output.data(), - output_capacity); + const auto max_unsigned = static_cast(max_out_val); + const index_int max_output = (max_unsigned > static_cast(NumBoxes)) + ? static_cast(NumBoxes) + : static_cast(max_unsigned); + nms_filter_per_block(idx, + my_sorted, + my_mask, + batch_idx, + class_idx, + max_output, + score_thr_val, + reinterpret_cast(raw_output_buf.data()), + reinterpret_cast(counts_buf.data())); +} + +// Serial compact wrapper invoked from the second JIT kernel. Reads the +// per-block counts and raw_output produced by `nonmaxsuppression` and copies +// selections into the final output in block_id (i.e. (batch, class)) order. +// `output` is last to match the JIT convention of using `inputs.back()` as +// the kernel's logical output buffer. 
+template +__device__ void nonmaxsuppression_compact(Counts counts_buf, + RawOutput raw_output_buf, + Output output) +{ + static_assert(NumBlocks > 0, "num_blocks must be > 0"); + + auto idx = make_index(); + nms_compact(idx, + reinterpret_cast(raw_output_buf.data()), + reinterpret_cast(counts_buf.data()), + output.data()); } } // namespace migraphx diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp index c99364396f1..94828ae35a2 100644 --- a/test/verify/test_nms.cpp +++ b/test/verify/test_nms.cpp @@ -35,12 +35,10 @@ struct test_nms : verify_program auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; auto boxes_l = mm->add_parameter("boxes", boxes_s); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto scores_l = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{4}); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); @@ -68,18 +66,10 @@ struct test_nms_multi_batch : verify_program auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}}; - std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, - 1.0, 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, - 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, - 0.4, 1.0, 1.0, 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, - 0.5, 100.5, 1.0, 1.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}}; - std::vector scores_vec = { - 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto boxes_l = mm->add_parameter("boxes", boxes_s); + auto scores_l = mm->add_parameter("scores", scores_s); 
auto max_out_l = mm->add_literal(int64_t{4}); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); @@ -107,16 +97,10 @@ struct test_nms_multi_class : verify_program auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, - -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, - 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}}; - std::vector scores_vec = { - 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto boxes_l = mm->add_parameter("boxes", boxes_s); + auto scores_l = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{2}); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); @@ -143,15 +127,10 @@ struct test_nms_not_center : verify_program auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, - 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, - 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto boxes_l = mm->add_parameter("boxes", boxes_s); + auto scores_l = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{4}); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); From 18ae57e89a7eb38a1c47104e0cb3eca1aef1337c Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 13 May 2026 
14:32:14 -0500 Subject: [PATCH 03/32] AI split into 3 kernels --- src/targets/gpu/jit/nonmaxsuppression.cpp | 262 +++++++++++++----- .../migraphx/kernels/nonmaxsuppression.hpp | 143 ++++++---- test/verify/test_nms.cpp | 1 + 3 files changed, 286 insertions(+), 120 deletions(-) diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index ff7dd14e289..189ba2057e8 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -43,12 +43,12 @@ namespace gpu { // and reinterpreted in the kernel. static constexpr std::size_t nms_bytes_per_data = 24; -// Phase-1 ("compute") kernel: each block runs NMS for its (batch, class) and -// writes selections into a per-block region of the raw_output scratch plus a -// per-block count. No global atomic counter is used, so per-block contents -// are deterministic. +// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes +// and bitonic-sorts them by descending score into a per-block region of the +// `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so +// the sort has enough parallelism even when NumBoxes is small relative to it. 
// NOLINTNEXTLINE -static const char* const nms_compute_kernel_src = R"__migraphx__( +static const char* const nms_sort_kernel_src = R"__migraphx__( #include #include @@ -56,23 +56,50 @@ namespace migraphx { extern "C" { -MIGRAPHX_GLOBAL void nms_kernel(${params}) +MIGRAPHX_GLOBAL void nms_sort_kernel(${params}) { - make_tensors()(${args})([](auto boxes, - auto scores, + make_tensors()(${args})([](auto boxes, auto scores, auto sorted) { + nonmaxsuppression_sort<${center_point_box}, + ${num_batches}, + ${num_classes}, + ${num_boxes}, + ${aligned_num_boxes}>(boxes, scores, sorted); + }); +} + +} + +} // namespace migraphx +)__migraphx__"; + +// Phase-2 ("filter") kernel: each block reads its (batch, class)'s sorted +// records out of the shared `sorted` buffer, builds the IoU mask, runs the +// greedy filter, and writes selections into a per-block region of the +// `raw_output` scratch plus a per-block count. No global atomic counter is +// used, so per-block contents are deterministic. +// NOLINTNEXTLINE +static const char* const nms_filter_kernel_src = R"__migraphx__( +#include +#include + +namespace migraphx { + +extern "C" { + +MIGRAPHX_GLOBAL void nms_filter_kernel(${params}) +{ + make_tensors()(${args})([](auto sorted, auto max_p, auto iou_p, auto thr_p, - auto sorted, auto mask, auto counts, auto raw_out) { - nonmaxsuppression<${center_point_box}, - ${num_batches}, - ${num_classes}, - ${num_boxes}, - ${aligned_num_boxes}>( - boxes, scores, max_p, iou_p, thr_p, sorted, mask, counts, raw_out); + nonmaxsuppression_filter<${num_batches}, + ${num_classes}, + ${num_boxes}, + ${aligned_num_boxes}>( + sorted, max_p, iou_p, thr_p, mask, counts, raw_out); }); } @@ -81,11 +108,12 @@ MIGRAPHX_GLOBAL void nms_kernel(${params}) } // namespace migraphx )__migraphx__"; -// Phase-2 ("compact") kernel: a single thread walks the per-block raw_output -// regions in block_id order and copies the first counts[b] selections from -// each region into a contiguous prefix of the final 
output. The order of -// (block_id 0, 1, ...) is the same as the CPU op's (batch, class) iteration -// order, so the resulting output matches the CPU op exactly. +// Phase-3 ("compact") kernel: a single block does an exclusive prefix scan +// over the per-block counts to obtain output offsets, then its threads +// scatter selections from each per-block region of `raw_output` into the +// contiguous prefix of the final output. The order of (block_id 0, 1, ...) +// is the same as the CPU op's (batch, class) iteration order, so the +// resulting output matches the CPU op exactly. // NOLINTNEXTLINE static const char* const nms_compact_kernel_src = R"__migraphx__( #include @@ -111,30 +139,36 @@ struct nms_compiler : compiler { std::vector names() const { return {"nonmaxsuppression"}; } - // Compile the per-block compute kernel. `inputs` is: - // [boxes, scores, max, iou, score_thr, sorted, mask, counts, raw_output] - // `raw_output` is the last input so the framework treats it as the - // kernel's output buffer; the per-block counts is an in/out scratch. + // Compile the per-block sort kernel. `inputs` is: + // [boxes, scores, sorted] + // `sorted` is the last input so the framework treats it as the kernel's + // chained output flowing into the filter kernel. Launch is sized to + // AlignedNumBoxes so the bitonic sort has enough lane-parallelism even + // when NumBoxes is small relative to it. 
operation - compile_compute(context& ctx, const std::vector& inputs, const value& v) const + compile_sort(context& ctx, const std::vector& inputs, const value& v) const { - const auto& boxes_s = inputs[0]; - const auto& scores_s = inputs[1]; - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; - const auto aligned_b = static_cast(bit_ceil(static_cast(b))); - const auto block_size = std::min(aligned_b, std::size_t{1024}); + const auto& boxes_s = inputs[0]; + const auto& scores_s = inputs[1]; + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto aligned_b = static_cast(bit_ceil(static_cast(b))); + // bitonic block_sort uses __syncthreads between every stage; pad up + // to a wavefront so degenerate cases (e.g. NumBoxes <= 1) still + // launch a valid block. + const auto block_size = std::min( + std::max(aligned_b, std::size_t{64}), std::size_t{1024}); hip_compile_options options; options.inputs = inputs; - options.output = inputs.back(); // raw_output buffer - options.kernel_name = "nms_kernel"; + options.output = inputs.back(); // sorted buffer + options.kernel_name = "nms_sort_kernel"; options.virtual_inputs = inputs; options.set_launch_params(v, block_size * nb * nc, block_size); auto src = interpolate_string( - nms_compute_kernel_src, + nms_sort_kernel_src, {{"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, {"num_batches", std::to_string(nb)}, @@ -146,11 +180,58 @@ struct nms_compiler : compiler return compile_hip_code_object(ctx, src, options); } - // Compile the serial compaction kernel. `inputs` is: + // Compile the per-block filter kernel. `inputs` is: + // [sorted, max, iou, score_thr, mask, counts, raw_output] + // `raw_output` is the last input so the framework treats it as the + // kernel's chained output flowing into the compact kernel. 
The filter's + // inner loops are O(N) per (batch, class), so the launch is sized to + // NumBoxes (not AlignedNumBoxes) to avoid leaving padding-only threads + // idle. nb, nc, b are passed through the augmented value because the + // filter's inputs no longer carry the raw boxes / scores shapes. + operation + compile_filter(context& ctx, const std::vector& inputs, const value& v) const + { + const auto nb = v.at("num_batches").to(); + const auto nc = v.at("num_classes").to(); + const auto b = v.at("num_boxes").to(); + const auto aligned_b = static_cast(bit_ceil(static_cast(b))); + + // Clamp the per-block thread count to [64, 256]: a multiple of the + // wavefront size keeps __syncthreads / block_scan well-defined, and + // 256 is the sweet spot for the O(N) inner loops without inflating + // shared-memory pressure on `removed[N]` (which is sized by N, not by + // block_size). + const auto block_size = std::min( + std::max( + static_cast(bit_ceil(static_cast(b))), + std::size_t{64}), + std::size_t{256}); + + hip_compile_options options; + options.inputs = inputs; + options.output = inputs.back(); // raw_output buffer + options.kernel_name = "nms_filter_kernel"; + options.virtual_inputs = inputs; + options.set_launch_params(v, block_size * nb * nc, block_size); + + auto src = interpolate_string( + nms_filter_kernel_src, + {{"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"num_batches", std::to_string(nb)}, + {"num_classes", std::to_string(nc)}, + {"num_boxes", std::to_string(b)}, + {"aligned_num_boxes", std::to_string(aligned_b)}}); + return compile_hip_code_object(ctx, src, options); + } + + // Compile the compaction kernel. `inputs` is: // [counts, raw_output, output] - // Launched with one thread (single block, single thread) since the work - // is intentionally serial: it walks per-block regions in fixed order to - // produce the exact byte-for-byte output the CPU op produces. 
+ // Launched as a single block: an exclusive prefix scan over counts gives + // each per-block region a base offset, then the block's threads scatter + // selections to those offsets in parallel. The single-block constraint + // keeps the scan in shared memory; `nms_compact` static_asserts a hard + // cap on NumBlocks that comfortably fits any realistic ONNX NMS. operation compile_compact(context& ctx, const std::vector& inputs, const value& v) const { @@ -163,12 +244,23 @@ struct nms_compiler : compiler ? raw_s.elements() / (num_blocks * std::size_t{3}) : std::size_t{0}; + // Pick a block size large enough to give the scan and scatter useful + // parallelism without inflating LDS pressure. block_scan requires the + // block size to be a multiple of the wavefront size; 64 is the + // smallest safe choice for all supported gfx targets. + const auto total = std::max(num_blocks * num_boxes, std::size_t{1}); + const auto block_size = std::min( + std::max( + static_cast(bit_ceil(static_cast(total))), + std::size_t{64}), + std::size_t{256}); + hip_compile_options options; options.inputs = inputs; options.output = inputs.back(); options.kernel_name = "nms_compact_kernel"; options.virtual_inputs = inputs; - options.set_launch_params(v, std::size_t{1}, std::size_t{1}); + options.set_launch_params(v, block_size, block_size); // one block auto src = interpolate_string( nms_compact_kernel_src, @@ -179,12 +271,24 @@ struct nms_compiler : compiler return compile_hip_code_object(ctx, src, options); } - // Required compiler<> hook: return the compute kernel based on the raw - // input shapes. The full two-kernel chain is handled in `compile()`; this - // entry point is only used by callers that ask for a single op view. + // Required compiler<> hook: return the sort kernel built from the raw + // user input shapes (boxes, scores). The full three-kernel chain is + // handled in `compile()`; this entry point is only used by callers that + // ask for a single op view. 
operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - return compile_compute(ctx, inputs, v); + if(inputs.size() < 2) + MIGRAPHX_THROW("nms_compiler: compile_op needs at least boxes and scores"); + const auto& boxes_s = inputs[0]; + const auto& scores_s = inputs[1]; + if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) + MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D"); + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto aligned_b = static_cast(bit_ceil(static_cast(b))); + const shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; + return compile_sort(ctx, {boxes_s, scores_s, sorted_s}, v); } compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const @@ -234,23 +338,34 @@ struct nms_compiler : compiler // Per-block selection counts (one int32 per (batch, class) block). shape counts_s{shape::int32_type, {nb * nc}}; - // Compute kernel input shapes: [user inputs..., sorted, mask, counts, raw_out] - std::vector compute_shapes = raw_shapes; - compute_shapes.push_back(sorted_s); - compute_shapes.push_back(mask_s); - compute_shapes.push_back(counts_s); - compute_shapes.push_back(raw_output_s); + // Sort kernel input shapes: [boxes, scores, sorted] + std::vector sort_shapes = {boxes_s, scores_s, sorted_s}; + + // Filter kernel input shapes: [sorted, max, iou, thr, mask, counts, raw_out] + std::vector filter_shapes = {sorted_s, + raw_shapes[2], + raw_shapes[3], + raw_shapes[4], + mask_s, + counts_s, + raw_output_s}; // Compact kernel input shapes: [counts, raw_out, output] - std::vector compact_shapes; - compact_shapes.push_back(counts_s); - compact_shapes.push_back(raw_output_s); - compact_shapes.push_back(raw.back()->get_shape()); + std::vector compact_shapes = {counts_s, raw_output_s, raw.back()->get_shape()}; + + // The filter kernel can't recover nb/nc/b from its input shapes + // 
(sorted/mask/counts/raw_out are all flat scratch buffers), so we + // pass them through an augmented value alongside the op attributes. + value augmented = op.to_value(); + augmented["num_batches"] = nb; + augmented["num_classes"] = nc; + augmented["num_boxes"] = b; - auto compute_kop = compile_compute(ctx, compute_shapes, op.to_value()); - auto compact_kop = compile_compact(ctx, compact_shapes, op.to_value()); + auto sort_kop = compile_sort(ctx, sort_shapes, augmented); + auto filter_kop = compile_filter(ctx, filter_shapes, augmented); + auto compact_kop = compile_compact(ctx, compact_shapes, augmented); - std::vector kops = {compute_kop, compact_kop}; + std::vector kops = {sort_kop, filter_kop, compact_kop}; return {kops, [=](module& m, instruction_ref ins2, const std::vector& cops) { @@ -291,21 +406,24 @@ struct nms_compiler : compiler out = m.insert_instruction( ins2, make_op("hip::fill", {{"value", 0}}), out); - auto compute_args = args; - compute_args.push_back(sorted); - compute_args.push_back(mask); - compute_args.push_back(counts); - compute_args.push_back(raw_out); - - auto compute_ins = - m.insert_instruction(ins2, cops[0], compute_args); - - // Use compute_ins (returned raw_out) as the dataflow edge - // so the compact kernel is ordered after the compute - // kernel and the raw_out buffer remains live. - std::vector compact_args = { - counts, compute_ins, out}; - m.replace_instruction(ins2, cops[1], compact_args); + // Phase 1: sort. Inputs are [boxes, scores, sorted]; the + // returned `sort_ins` is the post-write `sorted` buffer + // which becomes the filter kernel's first input. + auto sort_ins = m.insert_instruction( + ins2, cops[0], {args[0], args[1], sorted}); + + // Phase 2: filter. Use `sort_ins` as the dataflow edge so + // the filter is ordered after sort and `sorted` stays + // live. Returned `filter_ins` is the post-write + // `raw_output` buffer fed to compact. 
+ auto filter_ins = m.insert_instruction( + ins2, + cops[1], + {sort_ins, args[2], args[3], args[4], mask, counts, raw_out}); + + // Phase 3: compact. Counts/filter_ins/out match the + // [counts, raw_output, output] order in compact_shapes. + m.replace_instruction(ins2, cops[2], {counts, filter_ins, out}); }}; } }; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index ead79aed578..dfdff7430c8 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -164,8 +166,8 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma // Phase 3: greedy filter that writes selections into a per-block region of a // scratch buffer (block_id * N entries) and stores the per-block count. A -// follow-up serial compaction kernel gathers per-block regions in block_id -// order to produce a deterministic compacted output that matches the CPU op. +// follow-up compaction kernel gathers per-block regions in block_id order to +// produce a deterministic compacted output that matches the CPU op. template __device__ void nms_filter_per_block(index idx, const nms_data* sorted, @@ -221,42 +223,60 @@ __device__ void nms_filter_per_block(index idx, block_counts[block_id] = static_cast(output_idx); } -// Serial compaction: a single thread walks per-block regions in block_id order -// (which equals the CPU op's (batch, class) iteration order) and copies the -// first block_counts[b] entries of each region into a contiguous prefix of the -// final output buffer. Trailing slots are left as the zero fill applied before -// this kernel runs. 
+// Single-block compaction: an exclusive prefix scan over block_counts gives +// each per-block region a base offset in the final output; threads in the +// single launched block then scatter the per-block selections in parallel. +// Block_id order is preserved, which matches the CPU op's (batch, class) +// iteration order, and each block writes its `block_counts[b]` entries in +// order, so the final output is bit-for-bit identical to the serial walker. +// Trailing slots are left as the zero fill applied before this kernel runs. template __device__ void nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output) { - if(idx.global == 0) - { - index_int dst = 0; - for(index_int b = 0; b < NumBlocks; ++b) + static_assert(NumBlocks > 0, "num_blocks must be > 0"); + // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB + // per-block budget for any realistic ONNX NMS (nb * nc). + static_assert(NumBlocks <= 16384, + "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); + + __shared__ int32_t offsets[NumBlocks]; + + // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b]. + block_scan( + idx, + op::sum{}, + int32_t{0}, + index_int{NumBlocks}, + [&](auto b) -> int32_t { return block_counts[b]; }, + [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; }); + __syncthreads(); + + // Parallel scatter: flatten (b, i) so all threads see roughly equal work, + // regardless of how `block_counts[b]` is distributed across blocks. 
+ constexpr index_int total = NumBlocks * NumBoxes; + idx.local_stride(total, [&](auto bi) { + const index_int b = bi / NumBoxes; + const index_int i = bi % NumBoxes; + if(i < static_cast(block_counts[b])) { - const int32_t cnt = block_counts[b]; - const int64_t* src = raw_output + b * NumBoxes * 3; - for(int32_t i = 0; i < cnt; ++i) - { - output[dst * 3 + 0] = src[i * 3 + 0]; - output[dst * 3 + 1] = src[i * 3 + 1]; - output[dst * 3 + 2] = src[i * 3 + 2]; - ++dst; - } + const int64_t* src = raw_output + (b * NumBoxes + i) * 3; + int64_t* dst = output + (offsets[b] + i) * 3; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; } - } + }); } -// Per-block driver: one block per (batch_idx, class_idx). Workspace pointers -// are sliced into per-block segments using idx.group. Selections are written -// to a per-block region of `raw_output` and the per-block count is written to -// `block_counts`; a follow-up compact kernel produces the final compacted -// output that matches the CPU op's ordering. +// Per-block sort driver: one block per (batch_idx, class_idx). Loads boxes / +// scores for this (batch, class) into a per-block region of `sorted_buf` and +// runs a block-level bitonic sort. The result feeds the follow-up filter +// kernel, which reads `sorted_buf` and writes the IoU mask / per-block +// selection list. // -// `raw_output_buf` is intentionally the last parameter so that JIT-compiled -// callers (which use `inputs.back()` as the kernel's output buffer) treat it -// as the chained output flowing into the compact kernel. +// `sorted_buf` is the last parameter so the JIT framework treats it as the +// chained output flowing into the filter kernel. 
template +__device__ void nonmaxsuppression_sort(Boxes boxes, Scores scores, Sorted sorted_buf) +{ + static_assert(NumBatches > 0, "num_batches must be > 0"); + static_assert(NumClasses > 0, "num_classes must be > 0"); + + auto idx = make_index(); + const index_int block_id = idx.group; + const int batch_idx = static_cast(block_id / NumClasses); + const int class_idx = static_cast(block_id % NumClasses); + + nms_data* my_sorted = + reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; + + const float* boxes_b = boxes.data() + batch_idx * NumBoxes * 4; + const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; + + nms_load_and_sort( + idx, boxes_b, scores_bc, my_sorted); +} + +// Per-block filter driver: one block per (batch_idx, class_idx). Reads the +// previously-sorted records out of `sorted_buf`, builds the IoU mask in +// `mask_buf`, then runs the greedy filter writing selections into a per-block +// region of `raw_output` and the per-block count into `counts_buf`. +// +// The box-coordinate convention has already been normalized into corner form +// in `sorted_buf`, so this driver does not need `CenterPointBox`. +// +// `raw_output_buf` is intentionally the last parameter so that JIT-compiled +// callers (which use `inputs.back()` as the kernel's output buffer) treat it +// as the chained output flowing into the compact kernel. 
+template -__device__ void nonmaxsuppression(Boxes boxes, - Scores scores, - MaxOut max_out_p, - IouThr iou_thr_p, - ScoreThr score_thr_p, - Sorted sorted_buf, - Mask mask_buf, - Counts counts_buf, - RawOutput raw_output_buf) +__device__ void nonmaxsuppression_filter(Sorted sorted_buf, + MaxOut max_out_p, + IouThr iou_thr_p, + ScoreThr score_thr_p, + Mask mask_buf, + Counts counts_buf, + RawOutput raw_output_buf) { static_assert(NumBatches > 0, "num_batches must be > 0"); static_assert(NumClasses > 0, "num_classes must be > 0"); @@ -294,19 +348,12 @@ __device__ void nonmaxsuppression(Boxes boxes, reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; uint8_t* my_mask = reinterpret_cast(mask_buf.data()) + block_id * iou_packed_size; - const float* boxes_b = boxes.data() + batch_idx * NumBoxes * 4; - const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; - // Pull scalar tensor inputs once. They're broadcast to all threads via the // common load (each thread reads the same single element). const int64_t max_out_val = max_out_p[0]; const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; - nms_load_and_sort( - idx, boxes_b, scores_bc, my_sorted); - __syncthreads(); - if constexpr(NumBoxes > 1) { nms_make_iou_mask(idx, my_sorted, my_mask, iou_thr_val); @@ -332,8 +379,8 @@ __device__ void nonmaxsuppression(Boxes boxes, reinterpret_cast(counts_buf.data())); } -// Serial compact wrapper invoked from the second JIT kernel. Reads the -// per-block counts and raw_output produced by `nonmaxsuppression` and copies +// Compact wrapper invoked from the final JIT kernel. Reads the per-block +// counts and raw_output produced by `nonmaxsuppression_filter` and copies // selections into the final output in block_id (i.e. (batch, class)) order. // `output` is last to match the JIT convention of using `inputs.back()` as // the kernel's logical output buffer. 
diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp index 94828ae35a2..c25e303529d 100644 --- a/test/verify/test_nms.cpp +++ b/test/verify/test_nms.cpp @@ -148,6 +148,7 @@ struct test_nms_not_center : verify_program } }; +// TODO: update this test // Test NMS with dynamic inputs that have different compile-time spatial ranges. // This reproduces the scenario from nms_repro_minidimmismatch.py where // boxes has 10 spatial entries and scores has 5, but at runtime both are From ced7e69b6742944239f5cbde5e9885f48d3471a9 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 13 May 2026 14:32:28 -0500 Subject: [PATCH 04/32] Change NMS ONNX parsing and ref behavior --- src/include/migraphx/op/nonmaxsuppression.hpp | 48 +++++++------------ src/onnx/parse_nonmaxsuppression.cpp | 8 +++- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index b7b13f40354..87a4f1eebb0 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -40,9 +40,14 @@ #include #include -/* -https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression -*/ +/** + * nonmaxsuppression(boxes, + * scores, + * optional(max_output_boxes_per_class), + * optional(iou_threshold), + * optional(score_threshold)); + * Outputs tuple of {tensor with dims[max_num_boxes, 3]: selected_box_indices, scalar int64_t: num_selected_indices} + */ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace op { @@ -50,13 +55,11 @@ namespace op { struct nonmaxsuppression { bool center_point_box = false; - bool use_dyn_output = false; template static auto reflect(Self& self, F f) { - return pack(f(self.center_point_box, "center_point_box"), - f(self.use_dyn_output, "use_dyn_output")); + return pack(f(self.center_point_box, "center_point_box")); } std::string name() const { return "nonmaxsuppression"; } @@ -87,21 +90,9 @@ struct 
nonmaxsuppression } }; - bool needs_dyn_output = use_dyn_output or inputs.at(0).dynamic() or inputs.at(1).dynamic(); - - if(needs_dyn_output) - { - std::vector out_lens = {}; - out_lens.push_back({0, max_num_boxes}); - out_lens.push_back({3, 3}); - return {shape::int64_type, out_lens}; - } - else - { - fixed_shape_error_check(); - std::vector out_lens = {max_num_boxes, 3}; - return {shape::int64_type, out_lens}; - } + fixed_shape_error_check(); + std::vector out_lens = {max_num_boxes, 3}; + return {shape::int64_type, out_lens}; } struct box @@ -236,7 +227,6 @@ struct nonmaxsuppression double iou_threshold, double score_threshold) const { - std::fill(output.begin(), output.end(), 0); const auto& lens = scores.get_shape().lens(); const auto num_batches = lens[0]; const auto num_classes = lens[1]; @@ -325,14 +315,12 @@ struct nonmaxsuppression score_threshold); }); }); - if(output_shape.dynamic()) - { - return result.reshape({output_shape.type(), {num_selected, 3}}); - } - else - { - return result; - } + shape scalar_int_shape = {shape::int64_type, {1}}; + argument num_selected_result{scalar_int_shape}; + num_selected_result.visit([&](auto output){ + output.begin() = num_selected; + }); + return {{result, num_selected}}; } }; diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index 212ee5123a4..c2389b0e675 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -39,8 +39,12 @@ struct parse_nonmaxsuppression : op_parser const std::vector& args) const { auto op = parser.load(opd.op_name, info); - op.from_value({{"use_dyn_output", parser.use_dyn_output}}); - return info.add_instruction(op, args); + auto nms_ins = info.add_instruction(op, args); + // variable ends input slice to handle dynamic shape output + auto nms_indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); + auto nms_num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), 
nms_ins); + auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}, nms_indices, nms_num_selected)); + return slice_ins; } }; From 84c7d3b630f9d9b4109654fc212c72231d2c1240 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 13 May 2026 20:22:25 -0500 Subject: [PATCH 05/32] Progress, refactor --- src/include/migraphx/op/nonmaxsuppression.hpp | 10 +- src/onnx/parse_nonmaxsuppression.cpp | 6 +- src/targets/gpu/jit/nonmaxsuppression.cpp | 48 ++-- .../migraphx/kernels/nonmaxsuppression.hpp | 241 ++++++++---------- .../kernels/include/migraphx/kernels/sort.hpp | 4 +- src/targets/gpu/lowering.cpp | 2 +- test/verify/test_nms.cpp | 8 +- 7 files changed, 147 insertions(+), 172 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 87a4f1eebb0..6b9af617909 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -72,7 +72,7 @@ struct nonmaxsuppression auto max_classes = inputs.at(1).max_lens().at(1); auto max_spatial_dimension = inputs.at(0).max_lens().at(1); // Per ONNX spec, output is [num_selected_indices, 3] where each row is - // [batch_index, class_index, box_index]. The maximum possible + // [batch_index, class_index, box_index]. The maximum possible // num_selected_indices = num_batches * num_classes * spatial_dimension. 
const auto max_num_boxes = max_batches * max_classes * max_spatial_dimension; @@ -92,7 +92,9 @@ struct nonmaxsuppression fixed_shape_error_check(); std::vector out_lens = {max_num_boxes, 3}; - return {shape::int64_type, out_lens}; + shape s_ind{shape::int64_type, out_lens}; + shape s_num_selected{shape::int64_type, {1}}; + return shape({s_ind, s_num_selected}); } struct box @@ -223,7 +225,7 @@ struct nonmaxsuppression std::size_t compute_nms(Output output, const Boxes& boxes, const Scores& scores, - std::size_t max_output_boxes_per_class, + int64_t max_output_boxes_per_class, double iou_threshold, double score_threshold) const { @@ -320,7 +322,7 @@ struct nonmaxsuppression num_selected_result.visit([&](auto output){ output.begin() = num_selected; }); - return {{result, num_selected}}; + return {{result, num_selected_result}}; } }; diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index c2389b0e675..0ffffa03bcd 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -41,9 +41,9 @@ struct parse_nonmaxsuppression : op_parser auto op = parser.load(opd.op_name, info); auto nms_ins = info.add_instruction(op, args); // variable ends input slice to handle dynamic shape output - auto nms_indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); - auto nms_num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); - auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}, nms_indices, nms_num_selected)); + auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); + auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); + auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); return slice_ins; } }; diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp 
b/src/targets/gpu/jit/nonmaxsuppression.cpp index 189ba2057e8..a473f8477be 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -135,54 +135,48 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) } // namespace migraphx )__migraphx__"; +// TODO: use compute_block_size and/or compute_global_for? +// TODO: Don't need num_batches, num_classes, num_boxes as template parameters since tensor_view has shapes. struct nms_compiler : compiler { std::vector names() const { return {"nonmaxsuppression"}; } - // Compile the per-block sort kernel. `inputs` is: - // [boxes, scores, sorted] - // `sorted` is the last input so the framework treats it as the kernel's - // chained output flowing into the filter kernel. Launch is sized to - // AlignedNumBoxes so the bitonic sort has enough lane-parallelism even - // when NumBoxes is small relative to it. + // Compile the sort kernel. + // inputs: [boxes, scores, sorted] operation compile_sort(context& ctx, const std::vector& inputs, const value& v) const { const auto& boxes_s = inputs[0]; const auto& scores_s = inputs[1]; - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; - const auto aligned_b = static_cast(bit_ceil(static_cast(b))); - // bitonic block_sort uses __syncthreads between every stage; pad up - // to a wavefront so degenerate cases (e.g. NumBoxes <= 1) still - // launch a valid block. 
- const auto block_size = std::min( - std::max(aligned_b, std::size_t{64}), std::size_t{1024}); + const auto num_batches = boxes_s.lens()[0]; + const auto num_boxes = boxes_s.lens()[1]; + const auto num_classes = scores_s.lens()[1]; + const auto aligned_b = static_cast(bit_ceil(static_cast(num_boxes))); + // clamp between 64 and 1024 threads based on aligned_num_boxes + const auto block_size = std::min(std::max(aligned_b, std::size_t{64}), std::size_t{1024}); hip_compile_options options; options.inputs = inputs; options.output = inputs.back(); // sorted buffer options.kernel_name = "nms_sort_kernel"; options.virtual_inputs = inputs; - options.set_launch_params(v, block_size * nb * nc, block_size); + options.set_launch_params(v, block_size * num_batches * num_classes, block_size); auto src = interpolate_string( nms_sort_kernel_src, {{"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, - {"num_batches", std::to_string(nb)}, - {"num_classes", std::to_string(nc)}, - {"num_boxes", std::to_string(b)}, + {"num_batches", std::to_string(num_batches)}, + {"num_classes", std::to_string(num_classes)}, + {"num_boxes", std::to_string(num_boxes)}, {"aligned_num_boxes", std::to_string(aligned_b)}, {"center_point_box", v.at("center_point_box").to() ? "true" : "false"}}); return compile_hip_code_object(ctx, src, options); } - // Compile the per-block filter kernel. `inputs` is: - // [sorted, max, iou, score_thr, mask, counts, raw_output] - // `raw_output` is the last input so the framework treats it as the + // inputs: [sorted, max, iou, score_thr, mask, counts, raw_output] + // `raw_output` is the last input so the framework treats it as the( // kernel's chained output flowing into the compact kernel. 
The filter's // inner loops are O(N) per (batch, class), so the launch is sized to // NumBoxes (not AlignedNumBoxes) to avoid leaving padding-only threads @@ -225,6 +219,7 @@ struct nms_compiler : compiler return compile_hip_code_object(ctx, src, options); } + // TODO: REDO this whole thing. It doesn't make sense. // Compile the compaction kernel. `inputs` is: // [counts, raw_output, output] // Launched as a single block: an exclusive prefix scan over counts gives @@ -260,6 +255,7 @@ struct nms_compiler : compiler options.output = inputs.back(); options.kernel_name = "nms_compact_kernel"; options.virtual_inputs = inputs; + // BUG: this is not one block options.set_launch_params(v, block_size, block_size); // one block auto src = interpolate_string( @@ -398,14 +394,6 @@ struct nms_compiler : compiler auto counts = m.insert_instruction( ins2, make_op("hip::allocate", {{"shape", to_value(counts_s)}})); - // Pre-zero the final output buffer so unwritten rows match - // the CPU implementation's behavior (trailing zeros). The - // counts and raw_out scratch don't need zeroing: each - // block writes its count exactly once and the compact - // kernel only reads counts[b] entries from each block. - out = m.insert_instruction( - ins2, make_op("hip::fill", {{"value", 0}}), out); - // Phase 1: sort. Inputs are [boxes, scores, sorted]; the // returned `sort_ins` is the post-write `sorted` buffer // which becomes the filter kernel's first input. diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index dfdff7430c8..c4c27a76ed3 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -36,9 +36,6 @@ namespace migraphx { -// Per-box record carried through the sort. 
Box corners are stored normalized -// to (xmin, ymin, xmax, ymax) so the IoU computation is independent of the -// center_point_box attribute. struct nms_data { float score; @@ -47,24 +44,25 @@ struct nms_data }; // Decode a single box into (xmin, ymin, xmax, ymax) corners. -template -__device__ inline array nms_normalize_box(const float* b) +// Normalize such that [x1, y1] is the bottom left corner +template +__device__ inline array nms_normalize_box(Box box) { if constexpr(CenterPointBox) { - const float xc = b[0]; - const float yc = b[1]; - const float hw = b[2] * 0.5f; - const float hh = b[3] * 0.5f; + const float xc = box[0]; + const float yc = box[1]; + const float hw = box[2] * 0.5f; + const float hh = box[3] * 0.5f; return {xc - hw, yc - hh, xc + hw, yc + hh}; } else { // ONNX layout: [y1, x1, y2, x2]; corners may be in either order. - const float y1 = b[0]; - const float x1 = b[1]; - const float y2 = b[2]; - const float x2 = b[3]; + const float y1 = box[0]; + const float x1 = box[1]; + const float y2 = box[2]; + const float x2 = box[3]; const float xmin = min(x1, x2); const float xmax = max(x1, x2); const float ymin = min(y1, y2); @@ -73,8 +71,9 @@ __device__ inline array nms_normalize_box(const float* b) } } +template __device__ inline bool -nms_iou_over_threshold(const array& a, const array& b, float threshold) +nms_iou_over_threshold(const Box a, Box b, float threshold) { const float left = max(a[0], b[0]); const float right = min(a[2], b[2]); @@ -97,6 +96,7 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N return (i * N - (i * (i + 1)) / 2) + j - (i + 1); } +// Comparator for sorting nms_data{}. 
struct nms_score_greater { constexpr bool operator()(const nms_data& a, const nms_data& b) const @@ -105,18 +105,42 @@ struct nms_score_greater } }; -// Phase 1: load (score, box, box_index) tuples into a per-block buffer of -// AlignedN entries (power of two), padding the [N, AlignedN) tail with sentinel -// values, then sort the buffer in descending order by score. -template -__device__ void nms_load_and_sort(index idx, - const float* boxes_b, // [N, 4] - const float* scores_bc, // [N] - nms_data* sorted) +// Phase 1 +// One block per (batch_idx, class_idx). +// Load data into per-block buffer of nms_data. +// Pads values after N with sentinel values. +// Sorts the nms_data in descending order by score. +// boxes_tv: dims([N, 4]) of float. +// scores_tv: dims([N]) of float. +// sorted_tv: dims([N]) of nms_data{}. +template +__device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output out_tv) { - idx.local_stride(AlignedN, [&](auto i) { - nms_data d; - if(i < N) + static_assert(NumBatches > 0, "num_batches must be > 0"); + static_assert(NumClasses > 0, "num_classes must be > 0"); + + auto idx = make_index(); + const index_int block_id = idx.group; + const int batch_idx = static_cast(block_id / NumClasses); + const int class_idx = static_cast(block_id % NumClasses); + + constexpr auto block_out_shape = make_shape(index_ints{}); + auto* p = reinterpret_cast(out_tv.data()) + block_id * AlignedNumBoxes; + auto block_out_tv = make_tensor_view(p, block_out_shape); + + const auto* boxes_b = boxes_tv.data() + batch_idx * NumBoxes * 4; + const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; + + nms_data d; + idx.local_stride(AlignedNumBoxes, [&](auto i) { + if(i < NumBoxes) { d.score = scores_bc[i]; d.box = nms_normalize_box(boxes_b + i * 4); @@ -130,14 +154,15 @@ __device__ void nms_load_and_sort(index idx, d.box = array{0.f, 0.f, 0.f, 0.f}; d.box_index = -1; } - sorted[i] = d; + block_out_tv[i] = d; }); 
__syncthreads(); - bitonic_sort{nms_score_greater{}}.template block_sort(idx, sorted); + bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_out_tv); } -// Phase 2: build the packed upper-triangular IoU mask for the N sorted boxes. -// Work is striped (i, N-1-i) per thread so each thread does roughly the same +// Phase 2 +// Build the packed upper-triangular IoU mask for the N sorted boxes. +// Work is striped such that each thread does a multiple of 2 rows so each does roughly the same // amount of work regardless of where it falls in the triangle. template __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* mask, float iou_thr) @@ -164,17 +189,16 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma } } -// Phase 3: greedy filter that writes selections into a per-block region of a -// scratch buffer (block_id * N entries) and stores the per-block count. A -// follow-up compaction kernel gathers per-block regions in block_id order to -// produce a deterministic compacted output that matches the CPU op. +// Phase 3 +// Greedy filter that writes selections into a per-block region of a +// scratch buffer (block_id * N entries) and stores the per-block count. template __device__ void nms_filter_per_block(index idx, const nms_data* sorted, const uint8_t* mask, int batch_idx, int class_idx, - index_int max_output, + int64_t max_output, float score_thr, int64_t* raw_output, // [num_blocks * N * 3] int32_t* block_counts) // [num_blocks] @@ -223,95 +247,13 @@ __device__ void nms_filter_per_block(index idx, block_counts[block_id] = static_cast(output_idx); } -// Single-block compaction: an exclusive prefix scan over block_counts gives -// each per-block region a base offset in the final output; threads in the -// single launched block then scatter the per-block selections in parallel. 
-// Block_id order is preserved, which matches the CPU op's (batch, class) -// iteration order, and each block writes its `block_counts[b]` entries in -// order, so the final output is bit-for-bit identical to the serial walker. -// Trailing slots are left as the zero fill applied before this kernel runs. -template -__device__ void -nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output) -{ - static_assert(NumBlocks > 0, "num_blocks must be > 0"); - // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB - // per-block budget for any realistic ONNX NMS (nb * nc). - static_assert(NumBlocks <= 16384, - "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); - - __shared__ int32_t offsets[NumBlocks]; - - // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b]. - block_scan( - idx, - op::sum{}, - int32_t{0}, - index_int{NumBlocks}, - [&](auto b) -> int32_t { return block_counts[b]; }, - [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; }); - __syncthreads(); - - // Parallel scatter: flatten (b, i) so all threads see roughly equal work, - // regardless of how `block_counts[b]` is distributed across blocks. - constexpr index_int total = NumBlocks * NumBoxes; - idx.local_stride(total, [&](auto bi) { - const index_int b = bi / NumBoxes; - const index_int i = bi % NumBoxes; - if(i < static_cast(block_counts[b])) - { - const int64_t* src = raw_output + (b * NumBoxes + i) * 3; - int64_t* dst = output + (offsets[b] + i) * 3; - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - }); -} - -// Per-block sort driver: one block per (batch_idx, class_idx). Loads boxes / -// scores for this (batch, class) into a per-block region of `sorted_buf` and -// runs a block-level bitonic sort. The result feeds the follow-up filter -// kernel, which reads `sorted_buf` and writes the IoU mask / per-block -// selection list. 
-// -// `sorted_buf` is the last parameter so the JIT framework treats it as the -// chained output flowing into the filter kernel. -template -__device__ void nonmaxsuppression_sort(Boxes boxes, Scores scores, Sorted sorted_buf) -{ - static_assert(NumBatches > 0, "num_batches must be > 0"); - static_assert(NumClasses > 0, "num_classes must be > 0"); - - auto idx = make_index(); - const index_int block_id = idx.group; - const int batch_idx = static_cast(block_id / NumClasses); - const int class_idx = static_cast(block_id % NumClasses); - - nms_data* my_sorted = - reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; - - const float* boxes_b = boxes.data() + batch_idx * NumBoxes * 4; - const float* scores_bc = scores.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; - - nms_load_and_sort( - idx, boxes_b, scores_bc, my_sorted); -} - // Per-block filter driver: one block per (batch_idx, class_idx). Reads the // previously-sorted records out of `sorted_buf`, builds the IoU mask in // `mask_buf`, then runs the greedy filter writing selections into a per-block // region of `raw_output` and the per-block count into `counts_buf`. // -// The box-coordinate convention has already been normalized into corner form -// in `sorted_buf`, so this driver does not need `CenterPointBox`. +// Expecting box-coordinate convention has already been normalized into corner form +// in `sorted_buf`. // // `raw_output_buf` is intentionally the last parameter so that JIT-compiled // callers (which use `inputs.back()` as the kernel's output buffer) treat it @@ -350,7 +292,7 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, // Pull scalar tensor inputs once. They're broadcast to all threads via the // common load (each thread reads the same single element). 
- const int64_t max_out_val = max_out_p[0]; + const int64_t max_output_boxes_per_class = max_out_p[0]; const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; @@ -360,25 +302,64 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, __syncthreads(); } - // The CPU op reads max_output_boxes_per_class as std::size_t, so a negative - // signed value is treated as a very large unsigned (effectively unlimited). - // Mirror that here by reinterpreting as unsigned and then capping at - // NumBoxes, which is the most we could ever emit per (batch, class) block. - const auto max_unsigned = static_cast(max_out_val); - const index_int max_output = (max_unsigned > static_cast(NumBoxes)) - ? static_cast(NumBoxes) - : static_cast(max_unsigned); nms_filter_per_block(idx, my_sorted, my_mask, batch_idx, class_idx, - max_output, + max_output_boxes_per_class, score_thr_val, reinterpret_cast(raw_output_buf.data()), reinterpret_cast(counts_buf.data())); } +// Single-block compaction: an exclusive prefix scan over block_counts gives +// each per-block region a base offset in the final output; threads in the +// single launched block then scatter the per-block selections in parallel. +// Block_id order is preserved, which matches the CPU op's (batch, class) +// iteration order, and each block writes its `block_counts[b]` entries in +// order, so the final output is bit-for-bit identical to the serial walker. +// Trailing slots are left as the zero fill applied before this kernel runs. +// TODO: this explaination makes no sense +template +__device__ void +nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output) +{ + static_assert(NumBlocks > 0, "num_blocks must be > 0"); + // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB + // per-block budget for any realistic ONNX NMS (nb * nc). 
+ static_assert(NumBlocks <= 16384, + "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); + + __shared__ int32_t offsets[NumBlocks]; + + // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b]. + block_scan( + idx, + op::sum{}, + int32_t{0}, + index_int{NumBlocks}, + [&](auto b) -> int32_t { return block_counts[b]; }, + [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; }); + __syncthreads(); + + // Parallel scatter: flatten (b, i) so all threads see roughly equal work, + // regardless of how `block_counts[b]` is distributed across blocks. + constexpr index_int total = NumBlocks * NumBoxes; + idx.local_stride(total, [&](auto bi) { + const index_int b = bi / NumBoxes; + const index_int i = bi % NumBoxes; + if(i < static_cast(block_counts[b])) + { + const int64_t* src = raw_output + (b * NumBoxes + i) * 3; + int64_t* dst = output + (offsets[b] + i) * 3; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + } + }); +} + // Compact wrapper invoked from the final JIT kernel. Reads the per-block // counts and raw_output produced by `nonmaxsuppression_filter` and copies // selections into the final output in block_id (i.e. (batch, class)) order. 
@@ -387,7 +368,7 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, template __device__ void nonmaxsuppression_compact(Counts counts_buf, RawOutput raw_output_buf, - Output output) + Output output_indices) { static_assert(NumBlocks > 0, "num_blocks must be > 0"); @@ -395,7 +376,7 @@ __device__ void nonmaxsuppression_compact(Counts counts_buf, nms_compact(idx, reinterpret_cast(raw_output_buf.data()), reinterpret_cast(counts_buf.data()), - output.data()); + output_indices.data()); } } // namespace migraphx diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index fa4d1c981e2..980a628682b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -146,8 +146,8 @@ struct bitonic_sort // (e.g. greater{} -> descending). The buffer must be sized to N (a // compile-time power of 2); callers pad with sentinel values when the // logical length is smaller. 
- template - __device__ void block_sort(index idx, T* buf) const + template + __device__ void block_sort(index idx, Array& buf) const { static_assert(is_power_of_2(N), "N must be a power of 2"); for(index_int k = 2; k <= N; k <<= 1) diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 9e510f10047..56c6039c075 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -108,7 +108,7 @@ struct miopen_apply add_if_op(); add_loop_op(); add_neg_op(); - add_nms_op(); + //add_nms_op(); add_lrn_op(); add_convolution_backwards_op(); add_select_module_op(); diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp index c25e303529d..9039784a689 100644 --- a/test/verify/test_nms.cpp +++ b/test/verify/test_nms.cpp @@ -43,14 +43,18 @@ struct test_nms : verify_program auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), boxes_l, scores_l, max_out_l, iou_threshold, score_threshold); - mm->add_return({r}); + + auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); + auto num_selected = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms); + auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); + mm->add_return({slice_ins}); return p; } From 43c10be99f4e05becae45c9761eaa0cf6301fa8a Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 14 May 2026 17:43:50 -0500 Subject: [PATCH 06/32] Cleanup before refactor into 3 JIT instructions --- .../include/migraphx/gpu/device/scan.hpp | 9 ++ src/targets/gpu/jit/nonmaxsuppression.cpp | 126 +++++++----------- .../migraphx/kernels/nonmaxsuppression.hpp | 103 +++++++------- 3 files changed, 99 insertions(+), 139 deletions(-) diff --git a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp 
b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp index 5a66f7f7308..95ce82f224e 100644 --- a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp +++ b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp @@ -33,6 +33,14 @@ inline namespace MIGRAPHX_INLINE_NS { namespace gpu { namespace device { +// Inclusive prefix sum within a kernel block. +// Hillis-Steele scan with double-buffered (ping-pong) shared array. +// `N`: upper bound on blockDim.x, sizes the shared buffer. +// `op`: associative binary reduce function ex. sum or max. +// `init`: initializer +// `fs`: striding function for thread work distribution. +// `input`: input with input(index_int). +// `output`: output with output(index_int, inclusive_scan_value_at_index_int). template __device__ void block_scan(index idx, Op op, T init, index_int n, Input input, Output output) { diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index a473f8477be..dc0202a4109 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -43,12 +43,12 @@ namespace gpu { // and reinterpreted in the kernel. static constexpr std::size_t nms_bytes_per_data = 24; -// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes +// Phase 1 ("sort") kernel: each block normalizes its (batch, class)'s boxes // and bitonic-sorts them by descending score into a per-block region of the // `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so // the sort has enough parallelism even when NumBoxes is small relative to it. 
// NOLINTNEXTLINE -static const char* const nms_sort_kernel_src = R"__migraphx__( +static const char* const nms_load_sort_kernel_src = R"__migraphx__( #include #include @@ -125,8 +125,8 @@ extern "C" { MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) { - make_tensors()(${args})([](auto counts, auto raw_out, auto out) { - nonmaxsuppression_compact<${num_blocks}, ${num_boxes}>(counts, raw_out, out); + make_tensors()(${args})([](auto bc_counts, auto output_indices, auto output_num_selected) { + nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(bc_counts, output_indices, output_num_selected); }); } @@ -144,7 +144,7 @@ struct nms_compiler : compiler // Compile the sort kernel. // inputs: [boxes, scores, sorted] operation - compile_sort(context& ctx, const std::vector& inputs, const value& v) const + compile_load_sort(context& ctx, const std::vector& inputs, const value& v) const { const auto& boxes_s = inputs[0]; const auto& scores_s = inputs[1]; @@ -157,7 +157,7 @@ struct nms_compiler : compiler hip_compile_options options; options.inputs = inputs; - options.output = inputs.back(); // sorted buffer + options.output = inputs.back(); options.kernel_name = "nms_sort_kernel"; options.virtual_inputs = inputs; options.set_launch_params(v, block_size * num_batches * num_classes, block_size); @@ -203,7 +203,7 @@ struct nms_compiler : compiler hip_compile_options options; options.inputs = inputs; - options.output = inputs.back(); // raw_output buffer + options.output = inputs.back(); options.kernel_name = "nms_filter_kernel"; options.virtual_inputs = inputs; options.set_launch_params(v, block_size * nb * nc, block_size); @@ -220,13 +220,7 @@ struct nms_compiler : compiler } // TODO: REDO this whole thing. It doesn't make sense. - // Compile the compaction kernel. 
`inputs` is: - // [counts, raw_output, output] - // Launched as a single block: an exclusive prefix scan over counts gives - // each per-block region a base offset, then the block's threads scatter - // selections to those offsets in parallel. The single-block constraint - // keeps the scan in shared memory; `nms_compact` static_asserts a hard - // cap on NumBlocks that comfortably fits any realistic ONNX NMS. + // Compiles the nms_compact_kernel. operation compile_compact(context& ctx, const std::vector& inputs, const value& v) const { @@ -239,10 +233,6 @@ struct nms_compiler : compiler ? raw_s.elements() / (num_blocks * std::size_t{3}) : std::size_t{0}; - // Pick a block size large enough to give the scan and scatter useful - // parallelism without inflating LDS pressure. block_scan requires the - // block size to be a multiple of the wavefront size; 64 is the - // smallest safe choice for all supported gfx targets. const auto total = std::max(num_blocks * num_boxes, std::size_t{1}); const auto block_size = std::min( std::max( @@ -255,36 +245,21 @@ struct nms_compiler : compiler options.output = inputs.back(); options.kernel_name = "nms_compact_kernel"; options.virtual_inputs = inputs; - // BUG: this is not one block - options.set_launch_params(v, block_size, block_size); // one block + options.set_launch_params(v, 1, block_size); auto src = interpolate_string( nms_compact_kernel_src, {{"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, - {"num_blocks", std::to_string(num_blocks)}, + {"num_batch_class", std::to_string(num_batch_class)}, {"num_boxes", std::to_string(num_boxes)}}); return compile_hip_code_object(ctx, src, options); } - // Required compiler<> hook: return the sort kernel built from the raw - // user input shapes (boxes, scores). The full three-kernel chain is - // handled in `compile()`; this entry point is only used by callers that - // ask for a single op view. 
+ // Required compiler<> hook, should not be used for this compiler. operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - if(inputs.size() < 2) - MIGRAPHX_THROW("nms_compiler: compile_op needs at least boxes and scores"); - const auto& boxes_s = inputs[0]; - const auto& scores_s = inputs[1]; - if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) - MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D"); - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; - const auto aligned_b = static_cast(bit_ceil(static_cast(b))); - const shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; - return compile_sort(ctx, {boxes_s, scores_s, sorted_s}, v); + return {}; } compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const @@ -326,28 +301,27 @@ struct nms_compiler : compiler const auto aligned_b = static_cast(bit_ceil(static_cast(b))); const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1}; - shape sorted_s{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; - shape mask_s{shape::uint8_type, {nb * nc * iou_packed}}; - // Per-block raw output: nb*nc blocks, each can write up to b + shape sorted_shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; + shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}}; + // Per-block output: nb*nc blocks, each can write up to b // selections of (batch, class, box_idx) int64 triples. - shape raw_output_s{shape::int64_type, {nb * nc * b * 3}}; - // Per-block selection counts (one int32 per (batch, class) block). - shape counts_s{shape::int32_type, {nb * nc}}; + shape output_s{shape::int64_type, {nb * nc * b * 3}}; + // Per-batch-per-class selection counts (one index_int per (batch, class) block). 
+ shape bc_counts_shape{shape::int32_type, {nb * nc}}; // Sort kernel input shapes: [boxes, scores, sorted] - std::vector sort_shapes = {boxes_s, scores_s, sorted_s}; + std::vector sort_shapes = {boxes_s, scores_s, sorted_shape}; // Filter kernel input shapes: [sorted, max, iou, thr, mask, counts, raw_out] - std::vector filter_shapes = {sorted_s, + std::vector filter_shapes = {sorted_shape, raw_shapes[2], raw_shapes[3], raw_shapes[4], - mask_s, - counts_s, + mask_shape, + bc_counts_shape, raw_output_s}; - // Compact kernel input shapes: [counts, raw_out, output] - std::vector compact_shapes = {counts_s, raw_output_s, raw.back()->get_shape()}; + std::vector compact_shapes = {bc_counts_shape, output_s, {shape::int64_type, {1}}}; // The filter kernel can't recover nb/nc/b from its input shapes // (sorted/mask/counts/raw_out are all flat scratch buffers), so we @@ -361,57 +335,47 @@ struct nms_compiler : compiler auto filter_kop = compile_filter(ctx, filter_shapes, augmented); auto compact_kop = compile_compact(ctx, compact_shapes, augmented); + // kernel operations std::vector kops = {sort_kop, filter_kop, compact_kop}; return {kops, - [=](module& m, instruction_ref ins2, const std::vector& cops) { - auto args = ins2->inputs(); - auto out = args.back(); + [=](module& m, instruction_ref rep_ins, const std::vector& ops) { + auto args = rep_ins->inputs(); + auto output = args.back(); args.pop_back(); - + + // fill out optional arguments if(args.size() < 3) { args.push_back(m.insert_literal( - ins2, literal{default_max_s, {std::int64_t{0}}})); + rep_ins, literal{default_max_s, {std::int64_t{0}}})); } if(args.size() < 4) { args.push_back( - m.insert_literal(ins2, literal{default_iou_s, {0.0f}})); + m.insert_literal(rep_ins, literal{default_iou_s, {0.0f}})); } if(args.size() < 5) { args.push_back( - m.insert_literal(ins2, literal{default_thr_s, {0.0f}})); + m.insert_literal(rep_ins, literal{default_thr_s, {0.0f}})); } - auto sorted = m.insert_instruction( - ins2, 
make_op("hip::allocate", {{"shape", to_value(sorted_s)}})); - auto mask = m.insert_instruction( - ins2, make_op("hip::allocate", {{"shape", to_value(mask_s)}})); - auto raw_out = m.insert_instruction( - ins2, make_op("hip::allocate", {{"shape", to_value(raw_output_s)}})); - auto counts = m.insert_instruction( - ins2, make_op("hip::allocate", {{"shape", to_value(counts_s)}})); - - // Phase 1: sort. Inputs are [boxes, scores, sorted]; the - // returned `sort_ins` is the post-write `sorted` buffer - // which becomes the filter kernel's first input. - auto sort_ins = m.insert_instruction( - ins2, cops[0], {args[0], args[1], sorted}); - - // Phase 2: filter. Use `sort_ins` as the dataflow edge so - // the filter is ordered after sort and `sorted` stays - // live. Returned `filter_ins` is the post-write - // `raw_output` buffer fed to compact. + auto sorted = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(sorted_shape)}})); + auto mask = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(mask_shape)}})); + auto bc_counts = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(bc_counts_shape)}})); + auto output_num_selected = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(scalar_shape)}})); + + auto load_sort_ins = m.insert_instruction(rep_ins, ops[0], {args[0], args[1], sorted}); + auto filter_ins = m.insert_instruction( - ins2, - cops[1], - {sort_ins, args[2], args[3], args[4], mask, counts, raw_out}); + rep_ins, + ops[1], + {load_sort_ins, args[2], args[3], args[4], mask, bc_counts, output}); - // Phase 3: compact. Counts/filter_ins/out match the - // [counts, raw_output, output] order in compact_shapes. 
- m.replace_instruction(ins2, cops[2], {counts, filter_ins, out}); + output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 0}}), filter_ins); + auto bc_counts_output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 1}}), filter_ins); + m.replace_instruction(rep_ins, ops[2], {bc_counts_output, output, output_num_selected}); }}; } }; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index c4c27a76ed3..3ac8520fc53 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -105,7 +105,7 @@ struct nms_score_greater } }; -// Phase 1 +// Kernel 1. // One block per (batch_idx, class_idx). // Load data into per-block buffer of nms_data. // Pads values after N with sentinel values. @@ -160,7 +160,7 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_out_tv); } -// Phase 2 +// Part of kernel 2. // Build the packed upper-triangular IoU mask for the N sorted boxes. // Work is striped such that each thread does a multiple of 2 rows so each does roughly the same // amount of work regardless of where it falls in the triangle. @@ -189,7 +189,7 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma } } -// Phase 3 +// Part of kernel 2. // Greedy filter that writes selections into a per-block region of a // scratch buffer (block_id * N entries) and stores the per-block count. 
template @@ -313,72 +313,59 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, reinterpret_cast(counts_buf.data())); } -// Single-block compaction: an exclusive prefix scan over block_counts gives -// each per-block region a base offset in the final output; threads in the -// single launched block then scatter the per-block selections in parallel. -// Block_id order is preserved, which matches the CPU op's (batch, class) -// iteration order, and each block writes its `block_counts[b]` entries in -// order, so the final output is bit-for-bit identical to the serial walker. -// Trailing slots are left as the zero fill applied before this kernel runs. -// TODO: this explaination makes no sense -template -__device__ void -nms_compact(index idx, const int64_t* raw_output, const int32_t* block_counts, int64_t* output) -{ - static_assert(NumBlocks > 0, "num_blocks must be > 0"); - // offsets[] is sized 4 * NumBlocks bytes in LDS, well within the 64 KB - // per-block budget for any realistic ONNX NMS (nb * nc). - static_assert(NumBlocks <= 16384, - "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); - - __shared__ int32_t offsets[NumBlocks]; - // Exclusive prefix sum: emit(b, inclusive) -> offsets[b] = inclusive - counts[b]. +// Kernel 3. +// Move batch/class box index entries to the beginning of the output buffer. +// Runs with 1 block. Swaps indices within `output_indices`. +// `bc_counts`: Number of selected boxes per batch per class. (read-only) +// `output_indices`: Output box indices that are initially segemented by non-initialized values between selected +// indices between each batch/class. After this kernel, the selected indicies will be compacted to the beginning +// of the tensor. +// `output_num_selected`: Total number of selected boxes. 
+template +__device__ void nonmaxsuppression_compact(const Counts bc_counts, + NumOutput output_num_selected, + IdxOutput output_indices) +{ + static_assert(NumBatchClass > 0, "NumBatchClass must be > 0"); + static_assert(NumBatchClass <= 16000, "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); + __shared__ array offsets; + // Exclusive prefix sum on bc_counts to get offsets block_scan( idx, op::sum{}, - int32_t{0}, - index_int{NumBlocks}, - [&](auto b) -> int32_t { return block_counts[b]; }, - [&](auto b, auto inclusive) { offsets[b] = inclusive - block_counts[b]; }); + 0, + NumBlocks, + [&](auto i) -> int32_t { return bc_counts[i]; }, + [&](auto i, auto inclusive_value) { offsets[i] = inclusive_value - block_counts[i]; }); __syncthreads(); - // Parallel scatter: flatten (b, i) so all threads see roughly equal work, - // regardless of how `block_counts[b]` is distributed across blocks. - constexpr index_int total = NumBlocks * NumBoxes; - idx.local_stride(total, [&](auto bi) { - const index_int b = bi / NumBoxes; - const index_int i = bi % NumBoxes; - if(i < static_cast(block_counts[b])) + // Get num_selected_boxes from last value of exclusive scan and add last bc_counts value. 
+ if(idx.local == 0) + { + output_num_selected[0] = offsets[NumBatchClass-1] + block_counts[NumBlocks-1]; + } + + // swap index values to make the output packed + constexpr index_int index_size = 3; + constexpr index_int max_entries = NumBatchClass * NumBoxes; + idx.local_stride(max_entries, [&](auto i) { + const index_int batch_class_idx = i / NumBoxes; + const index_int box_idx = i & NumBoxes; + if(box_idx < block_counts[batch_class_idx]) { - const int64_t* src = raw_output + (b * NumBoxes + i) * 3; - int64_t* dst = output + (offsets[b] + i) * 3; - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; + auto src = [&](auto j){return output_indices[batch_class_idx * NumBoxes + box_idx * index_size + j]}; + auto dst = [&](auto j){return output_indices[(offsets[batch_class_idx] + box_idx) * index_size + j]}; + array tmp_src = {src(0), src(1), src(2)}; + for(int k = 0; k < 3; ++k) + { + src(k) = dst(k); + dst(k) = tmp_src[k]; + } } }); } -// Compact wrapper invoked from the final JIT kernel. Reads the per-block -// counts and raw_output produced by `nonmaxsuppression_filter` and copies -// selections into the final output in block_id (i.e. (batch, class)) order. -// `output` is last to match the JIT convention of using `inputs.back()` as -// the kernel's logical output buffer. 
-template -__device__ void nonmaxsuppression_compact(Counts counts_buf, - RawOutput raw_output_buf, - Output output_indices) -{ - static_assert(NumBlocks > 0, "num_blocks must be > 0"); - - auto idx = make_index(); - nms_compact(idx, - reinterpret_cast(raw_output_buf.data()), - reinterpret_cast(counts_buf.data()), - output_indices.data()); -} - } // namespace migraphx #endif // MIGRAPHX_GUARD_KERNELS_NONMAXSUPPRESSION_HPP From f2734dcbbd155ca15a9a467b930214f4a4898537 Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 14 May 2026 18:12:05 -0500 Subject: [PATCH 07/32] minor progress --- .../migraphx/kernels/nonmaxsuppression.hpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 3ac8520fc53..94bd32dcd5e 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -105,7 +105,7 @@ struct nms_score_greater } }; -// Kernel 1. +// Phase 1 // One block per (batch_idx, class_idx). // Load data into per-block buffer of nms_data. // Pads values after N with sentinel values. @@ -148,8 +148,7 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output } else { - // Sentinel: -inf score so it never beats any real entry, and a - // negative box_index so accidental dereferencing is detectable. + // Sentinel: -inf score so it never beats any real entry d.score = -__FLT_MAX__; d.box = array{0.f, 0.f, 0.f, 0.f}; d.box_index = -1; @@ -160,12 +159,14 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_out_tv); } -// Part of kernel 2. +// Phase 2 // Build the packed upper-triangular IoU mask for the N sorted boxes. 
// Work is striped such that each thread does a multiple of 2 rows so each does roughly the same // amount of work regardless of where it falls in the triangle. -template -__device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* mask, float iou_thr) +// `sorted`: sorted nms_data{} tensor +// `mask`: bool mask tensor +template +__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, float iou_threshold) { constexpr index_int half = N / 2; @@ -173,7 +174,7 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma for(index_int j = i + 1; j < N; ++j) { mask[nms_packed_idx(i, j, N)] = - nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_thr) ? 1 : 0; + nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_threshold) ? 1 : 0; } }; @@ -189,7 +190,7 @@ __device__ void nms_make_iou_mask(index idx, const nms_data* sorted, uint8_t* ma } } -// Part of kernel 2. +// Phase 2 // Greedy filter that writes selections into a per-block region of a // scratch buffer (block_id * N entries) and stores the per-block count. template @@ -282,8 +283,8 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, auto idx = make_index(); const index_int block_id = idx.group; - const int batch_idx = static_cast(block_id / NumClasses); - const int class_idx = static_cast(block_id % NumClasses); + const int batch_idx = block_id / NumClasses; + const int class_idx = block_id % NumClasses; constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1; nms_data* my_sorted = @@ -314,7 +315,7 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, } -// Kernel 3. +// Phase 3 // Move batch/class box index entries to the beginning of the output buffer. // Runs with 1 block. Swaps indices within `output_indices`. // `bc_counts`: Number of selected boxes per batch per class. 
(read-only) From 637937738f8709be958d1343d8dc5f9ec55cab2b Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 14 May 2026 18:22:23 -0500 Subject: [PATCH 08/32] AI refactor to separate instructions --- src/targets/gpu/CMakeLists.txt | 1 + .../gpu/prepare_nonmaxsuppression.hpp | 48 ++++ src/targets/gpu/jit/nonmaxsuppression.cpp | 265 ++++++------------ .../migraphx/kernels/nonmaxsuppression.hpp | 37 ++- src/targets/gpu/lowering.cpp | 29 -- src/targets/gpu/prepare_nonmaxsuppression.cpp | 216 ++++++++++++++ src/targets/gpu/target.cpp | 3 + 7 files changed, 371 insertions(+), 228 deletions(-) create mode 100644 src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp create mode 100644 src/targets/gpu/prepare_nonmaxsuppression.cpp diff --git a/src/targets/gpu/CMakeLists.txt b/src/targets/gpu/CMakeLists.txt index 6d66ccdc573..b8e92310b99 100644 --- a/src/targets/gpu/CMakeLists.txt +++ b/src/targets/gpu/CMakeLists.txt @@ -183,6 +183,7 @@ add_library(migraphx_gpu pack_args.cpp prefuse_ops.cpp prepare_mlir.cpp + prepare_nonmaxsuppression.cpp prepare_reduce.cpp perfdb.cpp pooling.cpp diff --git a/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp b/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp new file mode 100644 index 00000000000..bf47c8607b9 --- /dev/null +++ b/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp @@ -0,0 +1,48 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + */ +#ifndef MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP +#define MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +struct module; + +namespace gpu { + +struct MIGRAPHX_GPU_EXPORT prepare_nonmaxsuppression +{ + std::string name() const { return "gpu::prepare_nonmaxsuppression"; } + void apply(module& m) const; +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index dc0202a4109..be32bf75479 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -27,9 +27,6 @@ #include #include #include -#include -#include -#include #include #include @@ -38,17 +35,12 @@ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { -// nms_data is laid out as { float score; float box[4]; int box_index; } for a -// total of 24 bytes per entry. The scratch workspace is allocated as raw int8 -// and reinterpreted in the kernel. -static constexpr std::size_t nms_bytes_per_data = 24; - -// Phase 1 ("sort") kernel: each block normalizes its (batch, class)'s boxes +// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes // and bitonic-sorts them by descending score into a per-block region of the // `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so // the sort has enough parallelism even when NumBoxes is small relative to it. 
// NOLINTNEXTLINE -static const char* const nms_load_sort_kernel_src = R"__migraphx__( +static const char* const nms_sort_kernel_src = R"__migraphx__( #include #include @@ -76,7 +68,9 @@ MIGRAPHX_GLOBAL void nms_sort_kernel(${params}) // records out of the shared `sorted` buffer, builds the IoU mask, runs the // greedy filter, and writes selections into a per-block region of the // `raw_output` scratch plus a per-block count. No global atomic counter is -// used, so per-block contents are deterministic. +// used, so per-block contents are deterministic. The argument order after the +// `mask` scratch reflects the precompile_op tuple output flatten order: +// (raw_output, bc_counts). // NOLINTNEXTLINE static const char* const nms_filter_kernel_src = R"__migraphx__( #include @@ -93,13 +87,13 @@ MIGRAPHX_GLOBAL void nms_filter_kernel(${params}) auto iou_p, auto thr_p, auto mask, - auto counts, - auto raw_out) { + auto raw_out, + auto counts) { nonmaxsuppression_filter<${num_batches}, ${num_classes}, ${num_boxes}, ${aligned_num_boxes}>( - sorted, max_p, iou_p, thr_p, mask, counts, raw_out); + sorted, max_p, iou_p, thr_p, mask, raw_out, counts); }); } @@ -125,8 +119,12 @@ extern "C" { MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) { - make_tensors()(${args})([](auto bc_counts, auto output_indices, auto output_num_selected) { - nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>(bc_counts, output_indices, output_num_selected); + make_tensors()(${args})([](auto bc_counts, + auto raw_output, + auto output_indices, + auto output_num_selected) { + nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>( + bc_counts, raw_output, output_indices, output_num_selected); }); } @@ -135,60 +133,65 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) } // namespace migraphx )__migraphx__"; -// TODO: use compute_block_size and/or compute_global_for? -// TODO: Don't need num_batches, num_classes, num_boxes as template parameters since tensor_view has shapes. 
-struct nms_compiler : compiler +// Compiler for the per-(batch, class) sort kernel. `inputs` is the +// precompile_op input list: [boxes, scores, sorted_alloc]. +struct nms_sort_compiler : compiler { - std::vector names() const { return {"nonmaxsuppression"}; } + std::vector names() const { return {"gpu::nms_sort"}; } - // Compile the sort kernel. - // inputs: [boxes, scores, sorted] - operation - compile_load_sort(context& ctx, const std::vector& inputs, const value& v) const + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { const auto& boxes_s = inputs[0]; const auto& scores_s = inputs[1]; - const auto num_batches = boxes_s.lens()[0]; - const auto num_boxes = boxes_s.lens()[1]; - const auto num_classes = scores_s.lens()[1]; - const auto aligned_b = static_cast(bit_ceil(static_cast(num_boxes))); - // clamp between 64 and 1024 threads based on aligned_num_boxes - const auto block_size = std::min(std::max(aligned_b, std::size_t{64}), std::size_t{1024}); + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto aligned_b = + static_cast(bit_ceil(static_cast(b))); + // Clamp the block size to [64, 1024] threads, sized for the bitonic sort. 
+ const auto block_size = std::min( + std::max(aligned_b, std::size_t{64}), std::size_t{1024}); hip_compile_options options; options.inputs = inputs; options.output = inputs.back(); options.kernel_name = "nms_sort_kernel"; options.virtual_inputs = inputs; - options.set_launch_params(v, block_size * num_batches * num_classes, block_size); + options.set_launch_params(v, block_size * nb * nc, block_size); auto src = interpolate_string( nms_sort_kernel_src, {{"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, - {"num_batches", std::to_string(num_batches)}, - {"num_classes", std::to_string(num_classes)}, - {"num_boxes", std::to_string(num_boxes)}, + {"num_batches", std::to_string(nb)}, + {"num_classes", std::to_string(nc)}, + {"num_boxes", std::to_string(b)}, {"aligned_num_boxes", std::to_string(aligned_b)}, - {"center_point_box", - v.at("center_point_box").to() ? "true" : "false"}}); + {"center_point_box", v.at("center_point_box").to() ? "true" : "false"}}); return compile_hip_code_object(ctx, src, options); } - // inputs: [sorted, max, iou, score_thr, mask, counts, raw_output] - // `raw_output` is the last input so the framework treats it as the( - // kernel's chained output flowing into the compact kernel. The filter's - // inner loops are O(N) per (batch, class), so the launch is sized to - // NumBoxes (not AlignedNumBoxes) to avoid leaving padding-only threads - // idle. nb, nc, b are passed through the augmented value because the - // filter's inputs no longer carry the raw boxes / scores shapes. - operation - compile_filter(context& ctx, const std::vector& inputs, const value& v) const + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +// Compiler for the filter kernel. 
`inputs` is the precompile_op input list: +// [sorted, max, iou, thr, mask, tuple_alloc] +// where `tuple_alloc` is a tuple allocation holding (raw_output, bc_counts). +// After flattening the tuple, the kernel sees 7 arguments. +struct nms_filter_compiler : compiler +{ + std::vector names() const { return {"gpu::nms_filter"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto nb = v.at("num_batches").to(); - const auto nc = v.at("num_classes").to(); - const auto b = v.at("num_boxes").to(); - const auto aligned_b = static_cast(bit_ceil(static_cast(b))); + const auto nb = v.at("num_batches").to(); + const auto nc = v.at("num_classes").to(); + const auto b = v.at("num_boxes").to(); + const auto aligned_b = + static_cast(bit_ceil(static_cast(b))); // Clamp the per-block thread count to [64, 256]: a multiple of the // wavefront size keeps __syncthreads / block_scan well-defined, and @@ -202,16 +205,16 @@ struct nms_compiler : compiler std::size_t{256}); hip_compile_options options; - options.inputs = inputs; + options.inputs = flatten(inputs); options.output = inputs.back(); options.kernel_name = "nms_filter_kernel"; - options.virtual_inputs = inputs; + options.virtual_inputs = options.inputs; options.set_launch_params(v, block_size * nb * nc, block_size); auto src = interpolate_string( nms_filter_kernel_src, - {{"params", enum_params(inputs.size(), "void * private_p")}, - {"args", enum_params(inputs.size(), "private_p")}, + {{"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, {"num_batches", std::to_string(nb)}, {"num_classes", std::to_string(nc)}, {"num_boxes", std::to_string(b)}, @@ -219,13 +222,23 @@ struct nms_compiler : compiler return compile_hip_code_object(ctx, src, options); } - // TODO: REDO this whole thing. It doesn't make sense. - // Compiles the nms_compact_kernel. 
- operation - compile_compact(context& ctx, const std::vector& inputs, const value& v) const + compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const + { + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); + } +}; + +// Compiler for the compact kernel. `inputs` is the precompile_op input list: +// [bc_counts, raw_output, tuple_alloc] +// where `tuple_alloc` is a tuple allocation holding (selected_indices, +// num_selected). After flattening, the kernel sees 4 arguments. `num_blocks` +// (a.k.a. nb*nc) and `num_boxes` are recovered from the input shapes. +struct nms_compact_compiler : compiler +{ + std::vector names() const { return {"gpu::nms_compact"}; } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - // Derive num_blocks (length of counts) and per-block stride NumBoxes - // (raw_output is sized nb*nc*NumBoxes*3 int64 entries). const auto& cnt_s = inputs[0]; const auto& raw_s = inputs[1]; const auto num_blocks = cnt_s.elements(); @@ -233,7 +246,7 @@ struct nms_compiler : compiler ? 
raw_s.elements() / (num_blocks * std::size_t{3}) : std::size_t{0}; - const auto total = std::max(num_blocks * num_boxes, std::size_t{1}); + const auto total = std::max(num_blocks * num_boxes, std::size_t{1}); const auto block_size = std::min( std::max( static_cast(bit_ceil(static_cast(total))), @@ -241,142 +254,24 @@ struct nms_compiler : compiler std::size_t{256}); hip_compile_options options; - options.inputs = inputs; + options.inputs = flatten(inputs); options.output = inputs.back(); options.kernel_name = "nms_compact_kernel"; - options.virtual_inputs = inputs; - options.set_launch_params(v, 1, block_size); + options.virtual_inputs = options.inputs; + options.set_launch_params(v, block_size, block_size); auto src = interpolate_string( nms_compact_kernel_src, - {{"params", enum_params(inputs.size(), "void * private_p")}, - {"args", enum_params(inputs.size(), "private_p")}, - {"num_batch_class", std::to_string(num_batch_class)}, + {{"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, + {"num_batch_class", std::to_string(num_blocks)}, {"num_boxes", std::to_string(num_boxes)}}); return compile_hip_code_object(ctx, src, options); } - // Required compiler<> hook, should not be used for this compiler. - operation compile_op(context& ctx, const std::vector& inputs, const value& v) const - { - return {}; - } - compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const { - // ins->inputs() is [user_inputs..., output_alloc] from - // insert_precompile_op. user_inputs has 2..5 entries per ONNX NMS. 
- auto raw = ins->inputs(); - if(raw.size() < 3 or raw.size() > 6) - MIGRAPHX_THROW("nms_compiler: unexpected input count " + std::to_string(raw.size())); - - std::vector raw_shapes; - raw_shapes.reserve(raw.size() - 1); - std::transform(raw.begin(), - raw.end() - 1, - std::back_inserter(raw_shapes), - [](auto i) { return i->get_shape(); }); - - // Default shapes for missing optional scalar inputs. The literals - // inserted by the replace lambda use these same shapes so the - // compiled kernel's tensor_view types match the runtime arguments. - const shape default_max_s{shape::int64_type, {1}}; - const shape default_iou_s{shape::float_type, {1}}; - const shape default_thr_s{shape::float_type, {1}}; - if(raw_shapes.size() < 3) - raw_shapes.push_back(default_max_s); - if(raw_shapes.size() < 4) - raw_shapes.push_back(default_iou_s); - if(raw_shapes.size() < 5) - raw_shapes.push_back(default_thr_s); - - const auto& boxes_s = raw_shapes[0]; - const auto& scores_s = raw_shapes[1]; - if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) - MIGRAPHX_THROW("nms_compiler: boxes and scores must be 3-D"); - - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; - const auto aligned_b = static_cast(bit_ceil(static_cast(b))); - const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1}; - - shape sorted_shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; - shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}}; - // Per-block output: nb*nc blocks, each can write up to b - // selections of (batch, class, box_idx) int64 triples. - shape output_s{shape::int64_type, {nb * nc * b * 3}}; - // Per-batch-per-class selection counts (one index_int per (batch, class) block). 
- shape bc_counts_shape{shape::int32_type, {nb * nc}}; - - // Sort kernel input shapes: [boxes, scores, sorted] - std::vector sort_shapes = {boxes_s, scores_s, sorted_shape}; - - // Filter kernel input shapes: [sorted, max, iou, thr, mask, counts, raw_out] - std::vector filter_shapes = {sorted_shape, - raw_shapes[2], - raw_shapes[3], - raw_shapes[4], - mask_shape, - bc_counts_shape, - raw_output_s}; - - std::vector compact_shapes = {bc_counts_shape, output_s, {shape::int64_type, {1}}}; - - // The filter kernel can't recover nb/nc/b from its input shapes - // (sorted/mask/counts/raw_out are all flat scratch buffers), so we - // pass them through an augmented value alongside the op attributes. - value augmented = op.to_value(); - augmented["num_batches"] = nb; - augmented["num_classes"] = nc; - augmented["num_boxes"] = b; - - auto sort_kop = compile_sort(ctx, sort_shapes, augmented); - auto filter_kop = compile_filter(ctx, filter_shapes, augmented); - auto compact_kop = compile_compact(ctx, compact_shapes, augmented); - - // kernel operations - std::vector kops = {sort_kop, filter_kop, compact_kop}; - - return {kops, - [=](module& m, instruction_ref rep_ins, const std::vector& ops) { - auto args = rep_ins->inputs(); - auto output = args.back(); - args.pop_back(); - - // fill out optional arguments - if(args.size() < 3) - { - args.push_back(m.insert_literal( - rep_ins, literal{default_max_s, {std::int64_t{0}}})); - } - if(args.size() < 4) - { - args.push_back( - m.insert_literal(rep_ins, literal{default_iou_s, {0.0f}})); - } - if(args.size() < 5) - { - args.push_back( - m.insert_literal(rep_ins, literal{default_thr_s, {0.0f}})); - } - - auto sorted = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(sorted_shape)}})); - auto mask = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(mask_shape)}})); - auto bc_counts = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(bc_counts_shape)}})); - auto 
output_num_selected = m.insert_instruction(rep_ins, make_op("hip::allocate", {{"shape", to_value(scalar_shape)}})); - - auto load_sort_ins = m.insert_instruction(rep_ins, ops[0], {args[0], args[1], sorted}); - - auto filter_ins = m.insert_instruction( - rep_ins, - ops[1], - {load_sort_ins, args[2], args[3], args[4], mask, bc_counts, output}); - - output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 0}}), filter_ins); - auto bc_counts_output = m.insert_instruction(rep_ins, make_op("get_tuple_elem", {{"index", 1}}), filter_ins); - m.replace_instruction(rep_ins, ops[2], {bc_counts_output, output, output_num_selected}); - }}; + return compile_op(ctx, to_shapes(ins->inputs()), op.to_value()); } }; diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 94bd32dcd5e..22eebbb1bb1 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -256,9 +256,10 @@ __device__ void nms_filter_per_block(index idx, // Expecting box-coordinate convention has already been normalized into corner form // in `sorted_buf`. // -// `raw_output_buf` is intentionally the last parameter so that JIT-compiled -// callers (which use `inputs.back()` as the kernel's output buffer) treat it -// as the chained output flowing into the compact kernel. +// The parameter order matches the flatten order of the precompile_op tuple +// output (raw_output, counts). `sorted_buf` and `mask_buf` are scratch inputs +// allocated upstream; `raw_output_buf` and `counts_buf` are the two halves of +// the tuple-typed output buffer. 
template + class RawOutput, + class Counts> __device__ void nonmaxsuppression_filter(Sorted sorted_buf, MaxOut max_out_p, IouThr iou_thr_p, ScoreThr score_thr_p, Mask mask_buf, - Counts counts_buf, - RawOutput raw_output_buf) + RawOutput raw_output_buf, + Counts counts_buf) { static_assert(NumBatches > 0, "num_batches must be > 0"); static_assert(NumClasses > 0, "num_classes must be > 0"); @@ -317,16 +318,24 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, // Phase 3 // Move batch/class box index entries to the beginning of the output buffer. -// Runs with 1 block. Swaps indices within `output_indices`. +// Runs with 1 block. Reads from `raw_indices` (the filter kernel's per-block +// output) and writes the compacted selections into `output_indices`. // `bc_counts`: Number of selected boxes per batch per class. (read-only) -// `output_indices`: Output box indices that are initially segemented by non-initialized values between selected -// indices between each batch/class. After this kernel, the selected indicies will be compacted to the beginning -// of the tensor. +// `raw_indices`: Per-block raw indices written by the filter kernel +// (read-only). +// `output_indices`: Output box indices, packed contiguously at the beginning +// of the buffer in (batch, class) iteration order. // `output_num_selected`: Total number of selected boxes. 
-template +template __device__ void nonmaxsuppression_compact(const Counts bc_counts, - NumOutput output_num_selected, - IdxOutput output_indices) + RawIndices raw_indices, + IdxOutput output_indices, + NumOutput output_num_selected) { static_assert(NumBatchClass > 0, "NumBatchClass must be > 0"); static_assert(NumBatchClass <= 16000, "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 56c6039c075..1a9275de52b 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -108,7 +108,6 @@ struct miopen_apply add_if_op(); add_loop_op(); add_neg_op(); - //add_nms_op(); add_lrn_op(); add_convolution_backwards_op(); add_select_module_op(); @@ -447,34 +446,6 @@ struct miopen_apply }); } - void add_nms_op() - { - apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { - // Fixed-output NMS is handled by the JIT kernel registered via - // jit/nonmaxsuppression.cpp; route it through insert_precompile_op - // so compile_ops picks it up later. The dynamic-output mode still - // falls back to the CPU implementation. 
- auto op_val = ins->get_operator().to_value(); - if(not op_val.at("use_dyn_output").to()) - return insert_precompile_op(ins); - - auto s = ins->get_shape(); - auto output = insert_allocation(ins, s); - std::vector cpu_inputs; - auto inputs = ins->inputs(); - std::transform( - inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { - return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); - }); - cpu_inputs.front() = - mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); - auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); - auto gpu_out = - mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); - return mod->replace_instruction(ins, gpu_out); - }); - } - void add_lrn_op() { apply_map.emplace("lrn", [=](instruction_ref ins) { diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp new file mode 100644 index 00000000000..8f9219428a6 --- /dev/null +++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp @@ -0,0 +1,216 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +// nms_data is laid out as { float score; float box[4]; int box_index; } for a +// total of 24 bytes per entry. The scratch workspace is allocated as raw int8 +// and reinterpreted in the kernel. +static constexpr std::size_t nms_bytes_per_data = 24; + +// Phase-1 op: sort boxes per (batch, class) into a flat byte scratch buffer. +struct nms_sort +{ + bool center_point_box = false; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.center_point_box, "center_point_box")); + } + + std::string name() const { return "gpu::nms_sort"; } + + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(2); + const auto& boxes_s = inputs.at(0); + const auto& scores_s = inputs.at(1); + if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) + MIGRAPHX_THROW("gpu::nms_sort: boxes and scores must be 3-D"); + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto aligned_b = + static_cast(bit_ceil(static_cast(b))); + return shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; + } +}; +MIGRAPHX_REGISTER_OP(nms_sort); + +// Phase-2 op: build the IoU mask and run the greedy filter. Produces a tuple +// of (raw_output, bc_counts). 
num_batches/num_classes/num_boxes are kept as +// op attributes because the filter inputs are flat scratch buffers from which +// these can't be recovered. +struct nms_filter +{ + std::size_t num_batches = 0; + std::size_t num_classes = 0; + std::size_t num_boxes = 0; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.num_batches, "num_batches"), + f(self.num_classes, "num_classes"), + f(self.num_boxes, "num_boxes")); + } + + std::string name() const { return "gpu::nms_filter"; } + + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(5); + shape raw_output_shape{shape::int64_type, {num_batches * num_classes * num_boxes * 3}}; + shape bc_counts_shape{shape::int32_type, {num_batches * num_classes}}; + return shape{{raw_output_shape, bc_counts_shape}}; + } +}; +MIGRAPHX_REGISTER_OP(nms_filter); + +// Phase-3 op: prefix-scan the per-block counts and compact the selections into +// the final (selected_indices, num_selected) tuple. 
+struct nms_compact +{ + template + static auto reflect(Self&, F) + { + return pack(); + } + + std::string name() const { return "gpu::nms_compact"; } + + shape compute_shape(const std::vector& inputs) const + { + check_shapes{inputs, *this}.has(2); + const auto& raw_out_s = inputs.at(1); + const auto max_num_boxes = raw_out_s.elements() / std::size_t{3}; + shape selected_indices_shape{shape::int64_type, {max_num_boxes, 3}}; + shape num_selected_shape{shape::int64_type, {1}}; + return shape{{selected_indices_shape, num_selected_shape}}; + } +}; +MIGRAPHX_REGISTER_OP(nms_compact); + +namespace { + +std::vector find_nms(module& m) +{ + std::vector result; + auto im = iterator_for(m); + std::copy_if(im.begin(), im.end(), std::back_inserter(result), [](auto ins) { + return ins->name() == "nonmaxsuppression"; + }); + return result; +} + +void rewrite_nms(module& m, instruction_ref ins) +{ + auto inputs = ins->inputs(); + if(inputs.size() < 2 or inputs.size() > 5) + MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " + + std::to_string(inputs.size())); + + const auto& boxes_s = inputs[0]->get_shape(); + const auto& scores_s = inputs[1]->get_shape(); + if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) + MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D"); + + const auto nb = boxes_s.lens()[0]; + const auto b = boxes_s.lens()[1]; + const auto nc = scores_s.lens()[1]; + const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1}; + + // Fill in missing optional scalar inputs with default literals. The kernels + // load these via tensor_view, so single-element shapes are needed. 
+ const shape default_max_s{shape::int64_type, {1}}; + const shape default_iou_s{shape::float_type, {1}}; + const shape default_thr_s{shape::float_type, {1}}; + if(inputs.size() < 3) + inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); + if(inputs.size() < 4) + inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}})); + if(inputs.size() < 5) + inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}})); + + auto op_val = ins->get_operator().to_value(); + bool center_point_box = op_val.at("center_point_box").to(); + + // Mask is scratch only; allocate up-front so the standard replace_allocate + // pass can later turn it into hip::allocate. + shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}}; + auto mask_alloc = + m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}})); + + auto sorted = m.insert_instruction( + ins, make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), inputs[0], inputs[1]); + + auto filter = m.insert_instruction( + ins, + make_op("gpu::nms_filter", + {{"num_batches", nb}, {"num_classes", nc}, {"num_boxes", b}}), + sorted, + inputs[2], + inputs[3], + inputs[4], + mask_alloc); + + auto raw_output = + m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); + auto bc_counts = + m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); + + auto compact = m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); + + m.replace_instruction(ins, compact); +} + +} // namespace + +void prepare_nonmaxsuppression::apply(module& m) const +{ + for(auto ins : find_nms(m)) + { + rewrite_nms(m, ins); + } +} + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp index 3ed3e72033d..8ff00a75b7b 100644 --- a/src/targets/gpu/target.cpp +++ b/src/targets/gpu/target.cpp @@ -73,6 +73,7 @@ #include #include 
#include +#include #include #include #include @@ -163,6 +164,8 @@ std::vector target::get_passes(migraphx::context& gctx, const compile_opti dead_code_elimination{}, auto_contiguous{}, dead_code_elimination{}, + prepare_nonmaxsuppression{}, + dead_code_elimination{}, lowering{&ctx, options.offload_copy}, eliminate_contiguous{"gpu::contiguous"}, dead_code_elimination{}, From 2ac67b0ee552483c344c4a7a5e80ad836a81e374 Mon Sep 17 00:00:00 2001 From: charlie Date: Fri, 15 May 2026 18:20:03 -0500 Subject: [PATCH 09/32] Progress on cleanup, now segmentation fault in kernel --- src/include/migraphx/op/nonmaxsuppression.hpp | 6 +- src/targets/gpu/compile_hip_code_object.cpp | 3 + src/targets/gpu/jit/nonmaxsuppression.cpp | 127 ++++------- .../migraphx/kernels/nonmaxsuppression.hpp | 206 ++++++++---------- src/targets/gpu/prepare_nonmaxsuppression.cpp | 181 ++++++++------- 5 files changed, 230 insertions(+), 293 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 6b9af617909..9cf3d41070b 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -294,7 +294,8 @@ struct nonmaxsuppression argument compute(const shape& output_shape, std::vector args) const { // make buffer of maximum size - shape max_output_shape = {output_shape.type(), output_shape.max_lens()}; + auto output_shapes = flatten({output_shape}); + shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()}; argument result{max_output_shape}; std::size_t max_output_boxes_per_class = @@ -317,8 +318,7 @@ struct nonmaxsuppression score_threshold); }); }); - shape scalar_int_shape = {shape::int64_type, {1}}; - argument num_selected_result{scalar_int_shape}; + argument num_selected_result{output_shapes.at(1)}; num_selected_result.visit([&](auto output){ output.begin() = num_selected; }); diff --git a/src/targets/gpu/compile_hip_code_object.cpp
b/src/targets/gpu/compile_hip_code_object.cpp index f44804758d5..efe3b4f80bd 100644 --- a/src/targets/gpu/compile_hip_code_object.cpp +++ b/src/targets/gpu/compile_hip_code_object.cpp @@ -192,6 +192,9 @@ compute_global_for(const context& ctx, std::size_t n, std::size_t over) }; } + +// `n`: The amount of parallel work within a block. +// `max_block_size`: Upper limit on block size. std::size_t compute_block_size(const context& ctx, std::size_t n, std::size_t max_block_size) { const std::size_t min_block_size = ctx.get_current_device().get_wavefront_size(); diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index be32bf75479..be37fcadbc6 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -35,10 +35,6 @@ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { -// Phase-1 ("sort") kernel: each block normalizes its (batch, class)'s boxes -// and bitonic-sorts them by descending score into a per-block region of the -// `sorted` scratch buffer. Launch dimensions are sized to AlignedNumBoxes so -// the sort has enough parallelism even when NumBoxes is small relative to it. // NOLINTNEXTLINE static const char* const nms_sort_kernel_src = R"__migraphx__( #include @@ -64,13 +60,6 @@ MIGRAPHX_GLOBAL void nms_sort_kernel(${params}) } // namespace migraphx )__migraphx__"; -// Phase-2 ("filter") kernel: each block reads its (batch, class)'s sorted -// records out of the shared `sorted` buffer, builds the IoU mask, runs the -// greedy filter, and writes selections into a per-block region of the -// `raw_output` scratch plus a per-block count. No global atomic counter is -// used, so per-block contents are deterministic. The argument order after the -// `mask` scratch reflects the precompile_op tuple output flatten order: -// (raw_output, bc_counts). 
// NOLINTNEXTLINE static const char* const nms_filter_kernel_src = R"__migraphx__( #include @@ -87,13 +76,13 @@ MIGRAPHX_GLOBAL void nms_filter_kernel(${params}) auto iou_p, auto thr_p, auto mask, - auto raw_out, + auto output, auto counts) { nonmaxsuppression_filter<${num_batches}, ${num_classes}, ${num_boxes}, ${aligned_num_boxes}>( - sorted, max_p, iou_p, thr_p, mask, raw_out, counts); + sorted, max_p, iou_p, thr_p, mask, output, counts); }); } @@ -102,12 +91,6 @@ MIGRAPHX_GLOBAL void nms_filter_kernel(${params}) } // namespace migraphx )__migraphx__"; -// Phase-3 ("compact") kernel: a single block does an exclusive prefix scan -// over the per-block counts to obtain output offsets, then its threads -// scatter selections from each per-block region of `raw_output` into the -// contiguous prefix of the final output. The order of (block_id 0, 1, ...) -// is the same as the CPU op's (batch, class) iteration order, so the -// resulting output matches the CPU op exactly. // NOLINTNEXTLINE static const char* const nms_compact_kernel_src = R"__migraphx__( #include @@ -119,12 +102,12 @@ extern "C" { MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) { - make_tensors()(${args})([](auto bc_counts, - auto raw_output, - auto output_indices, - auto output_num_selected) { + make_tensors()(${args})([](const auto bc_counts, + auto indices, + auto num_selected, + auto output) { nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>( - bc_counts, raw_output, output_indices, output_num_selected); + bc_counts, indices, num_selected, output); }); } @@ -133,8 +116,7 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) } // namespace migraphx )__migraphx__"; -// Compiler for the per-(batch, class) sort kernel. `inputs` is the -// precompile_op input list: [boxes, scores, sorted_alloc]. +// `inputs` is the precompile_op input list: [boxes, scores, sorted_alloc]. 
struct nms_sort_compiler : compiler { std::vector names() const { return {"gpu::nms_sort"}; } @@ -143,30 +125,29 @@ struct nms_sort_compiler : compiler { const auto& boxes_s = inputs[0]; const auto& scores_s = inputs[1]; - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; - const auto aligned_b = - static_cast(bit_ceil(static_cast(b))); - // Clamp the block size to [64, 1024] threads, sized for the bitonic sort. - const auto block_size = std::min( - std::max(aligned_b, std::size_t{64}), std::size_t{1024}); + const auto num_batches = boxes_s.lens()[0]; + const auto num_boxes = boxes_s.lens()[1]; + const auto num_classes = scores_s.lens()[1]; + const auto aligned_num_boxes = + static_cast(bit_ceil(static_cast(num_boxes))); + // NOTE: topK kernel uses relement/4 for amount of work in a block? + auto block_size = compute_block_size(ctx, num_boxes, 1024); hip_compile_options options; options.inputs = inputs; options.output = inputs.back(); options.kernel_name = "nms_sort_kernel"; options.virtual_inputs = inputs; - options.set_launch_params(v, block_size * nb * nc, block_size); + options.set_launch_params(v, block_size * num_batches * num_classes, block_size); auto src = interpolate_string( nms_sort_kernel_src, {{"params", enum_params(inputs.size(), "void * private_p")}, {"args", enum_params(inputs.size(), "private_p")}, - {"num_batches", std::to_string(nb)}, - {"num_classes", std::to_string(nc)}, - {"num_boxes", std::to_string(b)}, - {"aligned_num_boxes", std::to_string(aligned_b)}, + {"num_batches", std::to_string(num_batches)}, + {"num_classes", std::to_string(num_classes)}, + {"num_boxes", std::to_string(num_boxes)}, + {"aligned_num_boxes", std::to_string(aligned_num_boxes)}, {"center_point_box", v.at("center_point_box").to() ? "true" : "false"}}); return compile_hip_code_object(ctx, src, options); } @@ -177,9 +158,8 @@ struct nms_sort_compiler : compiler } }; -// Compiler for the filter kernel. 
`inputs` is the precompile_op input list: -// [sorted, max, iou, thr, mask, tuple_alloc] -// where `tuple_alloc` is a tuple allocation holding (raw_output, bc_counts). +// `inputs` is the precompile_op input list: [sorted, max, iou, thr, mask, tuple_alloc]. +// Where `tuple_alloc` is a tuple allocation holding (raw_output, bc_counts). // After flattening the tuple, the kernel sees 7 arguments. struct nms_filter_compiler : compiler { @@ -187,38 +167,30 @@ struct nms_filter_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto nb = v.at("num_batches").to(); - const auto nc = v.at("num_classes").to(); - const auto b = v.at("num_boxes").to(); - const auto aligned_b = - static_cast(bit_ceil(static_cast(b))); - - // Clamp the per-block thread count to [64, 256]: a multiple of the - // wavefront size keeps __syncthreads / block_scan well-defined, and - // 256 is the sweet spot for the O(N) inner loops without inflating - // shared-memory pressure on `removed[N]` (which is sized by N, not by - // block_size). - const auto block_size = std::min( - std::max( - static_cast(bit_ceil(static_cast(b))), - std::size_t{64}), - std::size_t{256}); + const auto num_batches = v.at("num_batches").to(); + const auto num_classes = v.at("num_classes").to(); + const auto num_boxes = v.at("num_boxes").to(); + const auto aligned_num_boxes = + static_cast(bit_ceil(static_cast(num_boxes))); + // TODO: tune for max block size? 
+ // num_boxes/2 because of strided thread work distribution + const auto block_size = compute_block_size(ctx, num_boxes/2, 256); hip_compile_options options; options.inputs = flatten(inputs); options.output = inputs.back(); options.kernel_name = "nms_filter_kernel"; options.virtual_inputs = options.inputs; - options.set_launch_params(v, block_size * nb * nc, block_size); + options.set_launch_params(v, block_size * num_batches * num_classes, block_size); auto src = interpolate_string( nms_filter_kernel_src, {{"params", enum_params(options.inputs.size(), "void * private_p")}, {"args", enum_params(options.inputs.size(), "private_p")}, - {"num_batches", std::to_string(nb)}, - {"num_classes", std::to_string(nc)}, - {"num_boxes", std::to_string(b)}, - {"aligned_num_boxes", std::to_string(aligned_b)}}); + {"num_batches", std::to_string(num_batches)}, + {"num_classes", std::to_string(num_classes)}, + {"num_boxes", std::to_string(num_boxes)}, + {"aligned_num_boxes", std::to_string(aligned_num_boxes)}}); return compile_hip_code_object(ctx, src, options); } @@ -228,30 +200,21 @@ struct nms_filter_compiler : compiler } }; -// Compiler for the compact kernel. `inputs` is the precompile_op input list: -// [bc_counts, raw_output, tuple_alloc] -// where `tuple_alloc` is a tuple allocation holding (selected_indices, -// num_selected). After flattening, the kernel sees 4 arguments. `num_blocks` -// (a.k.a. nb*nc) and `num_boxes` are recovered from the input shapes. +// `inputs` is the precompile_op input list: [bc_counts, raw_output, tuple_alloc] +// where `tuple_alloc` is a tuple allocation holding (selected_indices, num_selected). +// After flattening, the kernel sees 4 arguments. 
struct nms_compact_compiler : compiler { std::vector names() const { return {"gpu::nms_compact"}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto& cnt_s = inputs[0]; - const auto& raw_s = inputs[1]; - const auto num_blocks = cnt_s.elements(); - const auto num_boxes = (num_blocks > 0) - ? raw_s.elements() / (num_blocks * std::size_t{3}) - : std::size_t{0}; - - const auto total = std::max(num_blocks * num_boxes, std::size_t{1}); - const auto block_size = std::min( - std::max( - static_cast(bit_ceil(static_cast(total))), - std::size_t{64}), - std::size_t{256}); + const auto& cnt_s = inputs[0]; + const auto& indices_s = inputs[1]; + const auto num_batch_class = cnt_s.elements(); + const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3}); + // TODO: tune for max block size? + const auto block_size = compute_block_size(ctx, num_boxes, 256); hip_compile_options options; options.inputs = flatten(inputs); @@ -264,7 +227,7 @@ struct nms_compact_compiler : compiler nms_compact_kernel_src, {{"params", enum_params(options.inputs.size(), "void * private_p")}, {"args", enum_params(options.inputs.size(), "private_p")}, - {"num_batch_class", std::to_string(num_blocks)}, + {"num_batch_class", std::to_string(num_batch_class)}, {"num_boxes", std::to_string(num_boxes)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 22eebbb1bb1..c226ab78ab6 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -33,6 +33,7 @@ #include #include #include +#include namespace migraphx { @@ -44,7 +45,7 @@ struct nms_data }; // Decode a single box into (xmin, ymin, xmax, ymax) corners. 
-// Normalize such that [x1, y1] is the bottom left corner +// Normalize such that [x1, y1] is the bottom left corner. template __device__ inline array nms_normalize_box(Box box) { @@ -71,9 +72,9 @@ __device__ inline array nms_normalize_box(Box box) } } -template +template __device__ inline bool -nms_iou_over_threshold(const Box a, Box b, float threshold) +nms_iou_over_threshold(const Box a, Box b, Threshold threshold) { const float left = max(a[0], b[0]); const float right = min(a[2], b[2]); @@ -105,7 +106,6 @@ struct nms_score_greater } }; -// Phase 1 // One block per (batch_idx, class_idx). // Load data into per-block buffer of nms_data. // Pads values after N with sentinel values. @@ -123,8 +123,10 @@ template __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output out_tv) { - static_assert(NumBatches > 0, "num_batches must be > 0"); - static_assert(NumClasses > 0, "num_classes must be > 0"); + static_assert(NumBatches > 0); + static_assert(NumClasses > 0); + static_assert(NumBoxes > 0); + static_assert(AlignedNumBoxes > 0); auto idx = make_index(); const index_int block_id = idx.group; @@ -138,87 +140,86 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output const auto* boxes_b = boxes_tv.data() + batch_idx * NumBoxes * 4; const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; - nms_data d; + nms_data tmp_data; idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - d.score = scores_bc[i]; - d.box = nms_normalize_box(boxes_b + i * 4); - d.box_index = static_cast(i); + tmp_data.score = scores_bc[i]; + tmp_data.box = nms_normalize_box(boxes_b + i * 4); + tmp_data.box_index = static_cast(i); } else { // Sentinel: -inf score so it never beats any real entry - d.score = -__FLT_MAX__; - d.box = array{0.f, 0.f, 0.f, 0.f}; - d.box_index = -1; + tmp_data.score = -__FLT_MAX__; + tmp_data.box = array{0.f, 0.f, 0.f, 0.f}; + tmp_data.box_index = -1; } - block_out_tv[i] = 
d; + block_out_tv[i] = tmp_data; }); __syncthreads(); bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_out_tv); } -// Phase 2 -// Build the packed upper-triangular IoU mask for the N sorted boxes. +// Build the packed upper-triangular IoU mask for the NumBoxes sorted boxes. // Work is striped such that each thread does a multiple of 2 rows so each does roughly the same // amount of work regardless of where it falls in the triangle. // `sorted`: sorted nms_data{} tensor // `mask`: bool mask tensor -template -__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, float iou_threshold) +template +__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, IouThreshold iou_threshold) { - constexpr index_int half = N / 2; + static_assert(NumBoxes > 0); + constexpr index_int half = NumBoxes / 2; auto fill_row = [&](index_int i) { - for(index_int j = i + 1; j < N; ++j) + for(index_int j = i + 1; j < NumBoxes; ++j) { - mask[nms_packed_idx(i, j, N)] = + mask[nms_packed_idx(i, j, NumBoxes)] = nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_threshold) ? 1 : 0; } }; idx.local_stride(half, [&](auto i) { fill_row(i); - fill_row(N - 1 - i); + fill_row(NumBoxes - 1 - i); }); - if constexpr((N & 1) != 0 and N > 1) + // Have thread 0 do middle row if odd NumBoxes + if constexpr((NumBoxes & 1) != 0 and NumBoxes > 1) { if(idx.local == 0) fill_row(half); } } -// Phase 2 -// Greedy filter that writes selections into a per-block region of a -// scratch buffer (block_id * N entries) and stores the per-block count. -template +// TODO: use template for types +// Greedy filter that writes selections into a per-batch per-class region of output. 
+template __device__ void nms_filter_per_block(index idx, - const nms_data* sorted, - const uint8_t* mask, - int batch_idx, - int class_idx, + const Sorted sorted, + const Mask mask, int64_t max_output, float score_thr, - int64_t* raw_output, // [num_blocks * N * 3] - int32_t* block_counts) // [num_blocks] + Output output, + Counts bc_counts) { - __shared__ uint8_t removed[N > 0 ? N : 1]; - // Match the CPU op: only filter by score when score_threshold > 0 (the CPU - // takes the same branch). With a non-positive (or sentinel) threshold, all - // boxes are kept regardless of sign. + static_assert(NumBoxes > 1); + + const index_int block_id = idx.group; + const int batch_idx = block_id / NumClasses; + const int class_idx = block_id % NumClasses; + // TODO: use bits for removed mask + __shared__ uint8_t removed[NumBoxes]; + // Match the ref op: only filter by score when score_threshold > 0. const bool do_filter = score_thr > 0.f; - idx.local_stride(N, [&](auto i) { - removed[i] = (do_filter and sorted[i].score < score_thr) ? 1 : 0; + idx.local_stride(NumBoxes, [&](auto i) { + removed[i] = (do_filter and sorted[i].score < score_thr); }); __syncthreads(); - const index_int block_id = idx.group; - int64_t* my_output = raw_output + block_id * N * 3; - index_int output_idx = 0; - for(index_int i = 0; i < N; ++i) + for(index_int i = 0; i < NumBoxes; ++i) { if(output_idx >= max_output) { @@ -229,37 +230,25 @@ __device__ void nms_filter_per_block(index idx, { if(idx.local == 0) { - my_output[output_idx * 3 + 0] = batch_idx; - my_output[output_idx * 3 + 1] = class_idx; - my_output[output_idx * 3 + 2] = sorted[i].box_index; + output[output_idx * 3 + 0] = batch_idx; + output[output_idx * 3 + 1] = class_idx; + output[output_idx * 3 + 2] = sorted[i].box_index; } ++output_idx; - // Update removed[] using row i of the IoU mask. Each thread handles - // a stride of the row to balance work. 
- for(index_int j = i + 1 + idx.local; j < N; j += idx.nlocal()) + for(index_int j = i + 1 + idx.local; j < NumBoxes; j += idx.nlocal()) { - removed[j] |= mask[nms_packed_idx(i, j, N)]; + removed[j] |= mask[nms_packed_idx(i, j, NumBoxes)]; } } __syncthreads(); } if(idx.local == 0) - block_counts[block_id] = static_cast(output_idx); + bc_counts[block_id] = static_cast(output_idx); } -// Per-block filter driver: one block per (batch_idx, class_idx). Reads the -// previously-sorted records out of `sorted_buf`, builds the IoU mask in -// `mask_buf`, then runs the greedy filter writing selections into a per-block -// region of `raw_output` and the per-block count into `counts_buf`. -// -// Expecting box-coordinate convention has already been normalized into corner form -// in `sorted_buf`. -// -// The parameter order matches the flatten order of the precompile_op tuple -// output (raw_output, counts). `sorted_buf` and `mask_buf` are scratch inputs -// allocated upstream; `raw_output_buf` and `counts_buf` are the two halves of -// the tuple-typed output buffer. +// Per-block filter driver: one block per (batch_idx, class_idx).`. +// Expecting box-coordinate convention has already been normalized into corner form. template __device__ void nonmaxsuppression_filter(Sorted sorted_buf, MaxOut max_out_p, IouThr iou_thr_p, ScoreThr score_thr_p, - Mask mask_buf, - RawOutput raw_output_buf, - Counts counts_buf) + Mask mask, + Output output, + Counts bc_counts) { - static_assert(NumBatches > 0, "num_batches must be > 0"); - static_assert(NumClasses > 0, "num_classes must be > 0"); + static_assert(NumBatches > 0); + static_assert(NumClasses > 0); + static_assert(NumBoxes > 0); auto idx = make_index(); const index_int block_id = idx.group; - const int batch_idx = block_id / NumClasses; - const int class_idx = block_id % NumClasses; - constexpr index_int iou_packed_size = (NumBoxes > 1) ? (NumBoxes * (NumBoxes - 1)) / 2 : 1; + //constexpr index_int iou_packed_size = (NumBoxes > 1) ? 
(NumBoxes * (NumBoxes - 1)) / 2 : 1; - nms_data* my_sorted = - reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; - uint8_t* my_mask = reinterpret_cast(mask_buf.data()) + block_id * iou_packed_size; + constexpr auto my_sorted_shape = make_shape(index_ints{}); + nms_data* my_sorted_p = reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; + auto my_sorted = make_tensor_view(my_sorted_p, my_sorted_shape); + + auto my_mask = slice_tensor(mask, block_id, slice_axes<1>()); + auto my_output = slice_tensor(output, block_id, slice_axes<1, 2>()); - // Pull scalar tensor inputs once. They're broadcast to all threads via the - // common load (each thread reads the same single element). + // Read scalar tensor inputs const int64_t max_output_boxes_per_class = max_out_p[0]; const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; - if constexpr(NumBoxes > 1) - { - nms_make_iou_mask(idx, my_sorted, my_mask, iou_thr_val); - __syncthreads(); - } + nms_make_iou_mask(idx, my_sorted, my_mask, iou_thr_val); + __syncthreads(); - nms_filter_per_block(idx, + nms_filter_per_block(idx, my_sorted, my_mask, - batch_idx, - class_idx, max_output_boxes_per_class, score_thr_val, - reinterpret_cast(raw_output_buf.data()), - reinterpret_cast(counts_buf.data())); + my_output, + bc_counts); } -// Phase 3 -// Move batch/class box index entries to the beginning of the output buffer. -// Runs with 1 block. Reads from `raw_indices` (the filter kernel's per-block -// output) and writes the compacted selections into `output_indices`. +// Move batch/class box index entries to the beginning of the output buffer. Runs with 1 block. // `bc_counts`: Number of selected boxes per batch per class. (read-only) -// `raw_indices`: Per-block raw indices written by the filter kernel -// (read-only). 
-// `output_indices`: Output box indices, packed contiguously at the beginning +// `indices`: Box indices, kernel packs selected boxes in-place to the beginning // of the buffer in (batch, class) iteration order. -// `output_num_selected`: Total number of selected boxes. +// `num_selected`: Total number of selected boxes. template + class Idx, + class Num, + class Out> __device__ void nonmaxsuppression_compact(const Counts bc_counts, - RawIndices raw_indices, - IdxOutput output_indices, - NumOutput output_num_selected) + const Idx indices, + Num num_selected, + Out output) { - static_assert(NumBatchClass > 0, "NumBatchClass must be > 0"); - static_assert(NumBatchClass <= 16000, "nms_compact: NumBlocks exceeds the LDS budget for offsets[]"); - __shared__ array offsets; + static_assert(NumBatchClass > 0); + static_assert(NumBoxes > 0); + static_assert(NumBatchClass <= 16000, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]"); + + auto idx = make_index(); + __shared__ index_int offsets[NumBatchClass]; // Exclusive prefix sum on bc_counts to get offsets block_scan( idx, op::sum{}, 0, - NumBlocks, + NumBatchClass, [&](auto i) -> int32_t { return bc_counts[i]; }, - [&](auto i, auto inclusive_value) { offsets[i] = inclusive_value - block_counts[i]; }); + [&](auto i, auto inclusive_value) { offsets[i] = inclusive_value - bc_counts[i]; }); __syncthreads(); // Get num_selected_boxes from last value of exclusive scan and add last bc_counts value. 
if(idx.local == 0) { - output_num_selected[0] = offsets[NumBatchClass-1] + block_counts[NumBlocks-1]; + num_selected[0] = offsets[NumBatchClass-1] + bc_counts[NumBatchClass-1]; } // swap index values to make the output packed @@ -362,15 +345,12 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts, idx.local_stride(max_entries, [&](auto i) { const index_int batch_class_idx = i / NumBoxes; const index_int box_idx = i & NumBoxes; - if(box_idx < block_counts[batch_class_idx]) + if(box_idx < bc_counts[batch_class_idx]) { - auto src = [&](auto j){return output_indices[batch_class_idx * NumBoxes + box_idx * index_size + j]}; - auto dst = [&](auto j){return output_indices[(offsets[batch_class_idx] + box_idx) * index_size + j]}; - array tmp_src = {src(0), src(1), src(2)}; for(int k = 0; k < 3; ++k) { - src(k) = dst(k); - dst(k) = tmp_src[k]; + output[(offsets[batch_class_idx] + box_idx) * index_size + k] = + indices[batch_class_idx * NumBoxes + box_idx * index_size + k] ; } } }); diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp index 8f9219428a6..187d193b1bc 100644 --- a/src/targets/gpu/prepare_nonmaxsuppression.cpp +++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp @@ -26,13 +26,12 @@ #include #include #include -#include #include #include +#include #include #include -#include #include namespace migraphx { @@ -40,11 +39,11 @@ inline namespace MIGRAPHX_INLINE_NS { namespace gpu { // nms_data is laid out as { float score; float box[4]; int box_index; } for a -// total of 24 bytes per entry. The scratch workspace is allocated as raw int8 +// total of 24 bytes per entry. The scratch workspace is allocated as raw uint8 // and reinterpreted in the kernel. static constexpr std::size_t nms_bytes_per_data = 24; -// Phase-1 op: sort boxes per (batch, class) into a flat byte scratch buffer. +// Sort boxes per (batch, class) into nms_data{} tensor. 
struct nms_sort { bool center_point_box = false; @@ -64,20 +63,20 @@ struct nms_sort const auto& scores_s = inputs.at(1); if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) MIGRAPHX_THROW("gpu::nms_sort: boxes and scores must be 3-D"); - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; + const auto num_batches = boxes_s.lens()[0]; + const auto num_boxes = boxes_s.lens()[1]; + const auto num_classes = scores_s.lens()[1]; const auto aligned_b = - static_cast(bit_ceil(static_cast(b))); - return shape{shape::int8_type, {nb * nc * aligned_b * nms_bytes_per_data}}; + static_cast(bit_ceil(static_cast(num_boxes))); + return shape{shape::uint8_type, {num_batches * num_classes * aligned_b * nms_bytes_per_data}}; } }; MIGRAPHX_REGISTER_OP(nms_sort); -// Phase-2 op: build the IoU mask and run the greedy filter. Produces a tuple -// of (raw_output, bc_counts). num_batches/num_classes/num_boxes are kept as -// op attributes because the filter inputs are flat scratch buffers from which -// these can't be recovered. +// Build the IoU mask and run the greedy filter. +// Produces a tuple of (raw_output, bc_counts). +// num_batches/num_classes/num_boxes are kept as op attributes because the filter inputs +// is a scratch buffer from which these can't be recovered. 
struct nms_filter { std::size_t num_batches = 0; @@ -97,23 +96,19 @@ struct nms_filter shape compute_shape(const std::vector& inputs) const { check_shapes{inputs, *this}.has(5); - shape raw_output_shape{shape::int64_type, {num_batches * num_classes * num_boxes * 3}}; + shape output_shape{shape::int64_type, {num_batches * num_classes, num_boxes, 3}}; shape bc_counts_shape{shape::int32_type, {num_batches * num_classes}}; - return shape{{raw_output_shape, bc_counts_shape}}; + return shape{{output_shape, bc_counts_shape}}; } }; MIGRAPHX_REGISTER_OP(nms_filter); -// Phase-3 op: prefix-scan the per-block counts and compact the selections into -// the final (selected_indices, num_selected) tuple. +// TODO: This should work in-place, saving memory. Need to update IR to handle it. +// Needs a make_tuple type of operator that reuses the indicies input. +// Prefix-scan the per-block counts and compact the selections into +// the final selected_indices. Output as selected_indices and num_selected tuple. 
struct nms_compact { - template - static auto reflect(Self&, F) - { - return pack(); - } - std::string name() const { return "gpu::nms_compact"; } shape compute_shape(const std::vector& inputs) const @@ -130,85 +125,81 @@ MIGRAPHX_REGISTER_OP(nms_compact); namespace { -std::vector find_nms(module& m) +struct find_nonmaxsuppression { - std::vector result; - auto im = iterator_for(m); - std::copy_if(im.begin(), im.end(), std::back_inserter(result), [](auto ins) { - return ins->name() == "nonmaxsuppression"; - }); - return result; -} + auto matcher() const { return match::name("nonmaxsuppression"); } -void rewrite_nms(module& m, instruction_ref ins) -{ - auto inputs = ins->inputs(); - if(inputs.size() < 2 or inputs.size() > 5) - MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " + - std::to_string(inputs.size())); - - const auto& boxes_s = inputs[0]->get_shape(); - const auto& scores_s = inputs[1]->get_shape(); - if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) - MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D"); - - const auto nb = boxes_s.lens()[0]; - const auto b = boxes_s.lens()[1]; - const auto nc = scores_s.lens()[1]; - const auto iou_packed = (b > 1) ? (b * (b - 1) / 2) : std::size_t{1}; - - // Fill in missing optional scalar inputs with default literals. The kernels - // load these via tensor_view, so single-element shapes are needed. 
- const shape default_max_s{shape::int64_type, {1}}; - const shape default_iou_s{shape::float_type, {1}}; - const shape default_thr_s{shape::float_type, {1}}; - if(inputs.size() < 3) - inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); - if(inputs.size() < 4) - inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}})); - if(inputs.size() < 5) - inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}})); - - auto op_val = ins->get_operator().to_value(); - bool center_point_box = op_val.at("center_point_box").to(); - - // Mask is scratch only; allocate up-front so the standard replace_allocate - // pass can later turn it into hip::allocate. - shape mask_shape{shape::uint8_type, {nb * nc * iou_packed}}; - auto mask_alloc = - m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}})); - - auto sorted = m.insert_instruction( - ins, make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), inputs[0], inputs[1]); - - auto filter = m.insert_instruction( - ins, - make_op("gpu::nms_filter", - {{"num_batches", nb}, {"num_classes", nc}, {"num_boxes", b}}), - sorted, - inputs[2], - inputs[3], - inputs[4], - mask_alloc); - - auto raw_output = - m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); - auto bc_counts = - m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); - - auto compact = m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); - - m.replace_instruction(ins, compact); -} + void apply(module& m, const match::matcher_result& r) const + { + auto ins = r.result; + auto inputs = ins->inputs(); + if(inputs.size() < 2 or inputs.size() > 5) + MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " + + std::to_string(inputs.size())); + + const auto& boxes_s = inputs[0]->get_shape(); + const auto& scores_s = inputs[1]->get_shape(); + if(boxes_s.ndim() != 3 or scores_s.ndim() != 3) + 
MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D"); + + const auto num_batches = boxes_s.lens()[0]; + const auto num_boxes = boxes_s.lens()[1]; + const auto num_classes = scores_s.lens()[1]; + const auto iou_packed = (num_boxes * (num_boxes - 1) / 2); + + // Fill in missing optional scalar inputs with default literals. + const shape default_max_s{shape::int64_type, {1}}; + const shape default_iou_s{shape::float_type, {1}}; + const shape default_thr_s{shape::float_type, {1}}; + if(inputs.size() < 3) + inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); + if(inputs.size() < 4) + inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}})); + if(inputs.size() < 5) + inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}})); + + auto op_val = ins->get_operator().to_value(); + bool center_point_box = op_val.at("center_point_box").to(); + + // Mask is scratch only; allocate up-front so the standard + // replace_allocate pass can later turn it into hip::allocate. 
+ shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; + auto mask_alloc = + m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}})); + + auto sorted = m.insert_instruction( + ins, + make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), + inputs[0], + inputs[1]); + + auto filter = m.insert_instruction( + ins, + make_op("gpu::nms_filter", + {{"num_batches", num_batches}, {"num_classes", num_classes}, {"num_boxes", num_boxes}}), + sorted, + inputs[2], + inputs[3], + inputs[4], + mask_alloc); + + auto output = + m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); + auto bc_counts = + m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); + + auto compact = + m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, output); + + m.replace_instruction(ins, compact); + } +}; } // namespace void prepare_nonmaxsuppression::apply(module& m) const { - for(auto ins : find_nms(m)) - { - rewrite_nms(m, ins); - } + match::find_matches(m, find_nonmaxsuppression{}); } } // namespace gpu From 5ca611f7cc8d8eced2ce84152ec8e6fb3bc470ac Mon Sep 17 00:00:00 2001 From: charlie Date: Mon, 18 May 2026 12:31:16 -0500 Subject: [PATCH 10/32] Fix JIT global and local. Single verify_test test_nms works. 
--- .../gpu/include/migraphx/gpu/compile_hip_code_object.hpp | 4 +++- src/targets/gpu/jit/nonmaxsuppression.cpp | 6 +++--- test/verify/test_nms.cpp | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp index f434348dbd5..592e32b9af4 100644 --- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp +++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp @@ -36,8 +36,10 @@ namespace gpu { struct context; struct hip_compile_options -{ +{ + // Total number of threads std::size_t global; + // Threads per block std::size_t local; std::vector inputs; shape output; diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index be37fcadbc6..14df58b8d41 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -131,14 +131,14 @@ struct nms_sort_compiler : compiler const auto aligned_num_boxes = static_cast(bit_ceil(static_cast(num_boxes))); // NOTE: topK kernel uses relement/4 for amount of work in a block? 
- auto block_size = compute_block_size(ctx, num_boxes, 1024); + auto block_size = compute_block_size(ctx, aligned_num_boxes, 1024); hip_compile_options options; options.inputs = inputs; options.output = inputs.back(); options.kernel_name = "nms_sort_kernel"; options.virtual_inputs = inputs; - options.set_launch_params(v, block_size * num_batches * num_classes, block_size); + options.set_launch_params(v, num_batches * num_classes * block_size, block_size); auto src = interpolate_string( nms_sort_kernel_src, @@ -181,7 +181,7 @@ struct nms_filter_compiler : compiler options.output = inputs.back(); options.kernel_name = "nms_filter_kernel"; options.virtual_inputs = options.inputs; - options.set_launch_params(v, block_size * num_batches * num_classes, block_size); + options.set_launch_params(v, num_batches * num_classes * block_size, block_size); auto src = interpolate_string( nms_filter_kernel_src, diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp index 9039784a689..99a502dbbfc 100644 --- a/test/verify/test_nms.cpp +++ b/test/verify/test_nms.cpp @@ -51,10 +51,11 @@ struct test_nms : verify_program iou_threshold, score_threshold); - auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); + //auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); auto num_selected = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms); - auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); - mm->add_return({slice_ins}); + //auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); + //mm->add_return({slice_ins}); + mm->add_return({num_selected}); return p; } From a48c90999ab2114eae3fcea51fe6427fafcd9e05 Mon Sep 17 00:00:00 2001 From: charlie Date: Mon, 18 May 2026 17:04:07 -0500 Subject: [PATCH 11/32] Fixes --- 
src/include/migraphx/op/nonmaxsuppression.hpp | 9 ++++++--- src/targets/gpu/jit/nonmaxsuppression.cpp | 11 ++++++----- .../include/migraphx/kernels/nonmaxsuppression.hpp | 11 +++++------ src/targets/gpu/prepare_nonmaxsuppression.cpp | 2 +- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 9cf3d41070b..68ea521f4f6 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -297,12 +297,16 @@ struct nonmaxsuppression auto output_shapes = flatten({output_shape}); shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()}; argument result{max_output_shape}; + argument num_selected_result{output_shapes.at(1)}; std::size_t max_output_boxes_per_class = (args.size() > 2) ? (args.at(2).at()) : 0; if(max_output_boxes_per_class == 0) { - return result; + num_selected_result.visit([&](auto output){ + output.at(0) = 0; + }); + return {{result, num_selected_result}}; } double iou_threshold = (args.size() > 3) ? (args.at(3).at()) : 0.0f; double score_threshold = (args.size() > 4) ? 
(args.at(4).at()) : 0.0f; @@ -318,9 +322,8 @@ struct nonmaxsuppression score_threshold); }); }); - argument num_selected_result{output_shapes.at(1)}; num_selected_result.visit([&](auto output){ - output.begin() = num_selected; + output.at(0) = num_selected; }); return {{result, num_selected_result}}; } diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index 14df58b8d41..2581f975d8b 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -104,10 +104,10 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) { make_tensors()(${args})([](const auto bc_counts, auto indices, - auto num_selected, - auto output) { + auto selected_indices, + auto num_selected) { nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>( - bc_counts, indices, num_selected, output); + bc_counts, indices, selected_indices, num_selected); }); } @@ -213,8 +213,9 @@ struct nms_compact_compiler : compiler const auto& indices_s = inputs[1]; const auto num_batch_class = cnt_s.elements(); const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3}); - // TODO: tune for max block size? - const auto block_size = compute_block_size(ctx, num_boxes, 256); + // TODO: tune for block size? + // num_boxes block size could also work? 
+ const auto block_size = compute_block_size(ctx, num_batch_class * num_boxes, 256); hip_compile_options options; options.inputs = flatten(inputs); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index c226ab78ab6..ca4236c65fa 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -150,8 +150,8 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output } else { - // Sentinel: -inf score so it never beats any real entry - tmp_data.score = -__FLT_MAX__; + // Sentinel: score so it never beats any real entry + tmp_data.score = numeric_limits::lowest(); tmp_data.box = array{0.f, 0.f, 0.f, 0.f}; tmp_data.box_index = -1; } @@ -204,8 +204,7 @@ __device__ void nms_filter_per_block(index idx, Output output, Counts bc_counts) { - static_assert(NumBoxes > 1); - + static_assert(NumBoxes > 0); const index_int block_id = idx.group; const int batch_idx = block_id / NumClasses; const int class_idx = block_id % NumClasses; @@ -344,13 +343,13 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts, constexpr index_int max_entries = NumBatchClass * NumBoxes; idx.local_stride(max_entries, [&](auto i) { const index_int batch_class_idx = i / NumBoxes; - const index_int box_idx = i & NumBoxes; + const index_int box_idx = i % NumBoxes; if(box_idx < bc_counts[batch_class_idx]) { for(int k = 0; k < 3; ++k) { output[(offsets[batch_class_idx] + box_idx) * index_size + k] = - indices[batch_class_idx * NumBoxes + box_idx * index_size + k] ; + indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k] ; } } }); diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp index 187d193b1bc..25f65c65b00 100644 --- a/src/targets/gpu/prepare_nonmaxsuppression.cpp +++ 
b/src/targets/gpu/prepare_nonmaxsuppression.cpp @@ -189,7 +189,7 @@ struct find_nonmaxsuppression m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); auto compact = - m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, output); + m.insert_instruction(ins, make_op("gpu::nms_compact"), output, bc_counts); m.replace_instruction(ins, compact); } From e1e936b27c4cff366b446f39b832de5c08719472 Mon Sep 17 00:00:00 2001 From: charlie Date: Mon, 18 May 2026 17:43:35 -0500 Subject: [PATCH 12/32] Add ref-like tests for GPU NMS, rename shape's flatten to flatten_shapes because of a name clash with operator::flatten. --- src/include/migraphx/op/nonmaxsuppression.hpp | 7 +- src/include/migraphx/shape.hpp | 2 +- src/shape.cpp | 4 +- src/targets/gpu/code_object_op.cpp | 2 +- src/targets/gpu/jit/nonmaxsuppression.cpp | 4 +- src/targets/gpu/jit/pointwise.cpp | 2 +- src/targets/gpu/jit/reduce.cpp | 2 +- src/targets/gpu/jit/topk.cpp | 2 +- .../migraphx/kernels/nonmaxsuppression.hpp | 12 +- test/gpu/nonmaxsuppression.cpp | 311 ++++++++++++++++++ 10 files changed, 332 insertions(+), 16 deletions(-) create mode 100644 test/gpu/nonmaxsuppression.cpp diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 68ea521f4f6..b6cbd4c9bc1 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -294,7 +295,7 @@ struct nonmaxsuppression argument compute(const shape& output_shape, std::vector args) const { // make buffer of maximum size - auto output_shapes = flatten({output_shape}); + auto output_shapes = flatten_shapes({output_shape}); shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()}; argument result{max_output_shape}; argument num_selected_result{output_shapes.at(1)}; @@ -304,7 +305,7 @@ struct nonmaxsuppression 
if(max_output_boxes_per_class == 0) { num_selected_result.visit([&](auto output){ - output.at(0) = 0; + output[0] = 0; }); return {{result, num_selected_result}}; } @@ -323,7 +324,7 @@ struct nonmaxsuppression }); }); num_selected_result.visit([&](auto output){ - output.at(0) = num_selected; + output[0] = num_selected; }); return {{result, num_selected_result}}; } diff --git a/src/include/migraphx/shape.hpp b/src/include/migraphx/shape.hpp index 1a8c1f9d53e..08bff8ee04c 100644 --- a/src/include/migraphx/shape.hpp +++ b/src/include/migraphx/shape.hpp @@ -615,7 +615,7 @@ struct MIGRAPHX_EXPORT shape }; /// Flatten subshapes to a single vector of non-tuple type of shapes -MIGRAPHX_EXPORT std::vector flatten(const std::vector& shapes); +MIGRAPHX_EXPORT std::vector flatten_shapes(const std::vector& shapes); MIGRAPHX_EXPORT void migraphx_to_value(value& v, const shape& s); MIGRAPHX_EXPORT void migraphx_from_value(const value& v, shape& s); diff --git a/src/shape.cpp b/src/shape.cpp index 7732ead6b11..afd716f72d2 100644 --- a/src/shape.cpp +++ b/src/shape.cpp @@ -1378,14 +1378,14 @@ const std::vector& shape::sub_shapes() const { return impl->m_shapes; } void shape::debug_print() const { std::cout << *this << std::endl; } -std::vector flatten(const std::vector& shapes) +std::vector flatten_shapes(const std::vector& shapes) { std::vector result; for(const auto& s : shapes) { if(s.type() == shape::tuple_type) { - auto subs = flatten(s.sub_shapes()); + auto subs = flatten_shapes(s.sub_shapes()); result.insert(result.end(), subs.begin(), subs.end()); } else diff --git a/src/targets/gpu/code_object_op.cpp b/src/targets/gpu/code_object_op.cpp index 6a567329c6b..e53724c23ce 100644 --- a/src/targets/gpu/code_object_op.cpp +++ b/src/targets/gpu/code_object_op.cpp @@ -41,7 +41,7 @@ shape code_object_op::compute_shape(std::vector inputs) const std::transform(einputs.begin(), einputs.end(), einputs.begin(), [](const shape& s) { return s.normalize_standard(); }); - if(not 
migraphx::equal(flatten(einputs), flatten(inputs), &shape::is_compatible)) + if(not migraphx::equal(flatten_shapes(einputs), flatten_shapes(inputs), &shape::is_compatible)) MIGRAPHX_THROW("Input shapes have changed: [" + to_string_range(einputs) + "] -> [" + to_string_range(inputs) + "]"); auto output_buffer_shape = inputs.at(get_output_arg(inputs.size())); diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index 2581f975d8b..dfa5aaffcba 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -177,7 +177,7 @@ struct nms_filter_compiler : compiler const auto block_size = compute_block_size(ctx, num_boxes/2, 256); hip_compile_options options; - options.inputs = flatten(inputs); + options.inputs = flatten_shapes(inputs); options.output = inputs.back(); options.kernel_name = "nms_filter_kernel"; options.virtual_inputs = options.inputs; @@ -218,7 +218,7 @@ struct nms_compact_compiler : compiler const auto block_size = compute_block_size(ctx, num_batch_class * num_boxes, 256); hip_compile_options options; - options.inputs = flatten(inputs); + options.inputs = flatten_shapes(inputs); options.output = inputs.back(); options.kernel_name = "nms_compact_kernel"; options.virtual_inputs = options.inputs; diff --git a/src/targets/gpu/jit/pointwise.cpp b/src/targets/gpu/jit/pointwise.cpp index c3ce45c12cf..80597139dac 100644 --- a/src/targets/gpu/jit/pointwise.cpp +++ b/src/targets/gpu/jit/pointwise.cpp @@ -74,7 +74,7 @@ struct pointwise_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { hip_compile_options options; - options.inputs = flatten(inputs); + options.inputs = flatten_shapes(inputs); options.output = inputs.back(); options.virtual_inputs = reduce_dims(normalize_permutation(options.inputs)); options.emplace_param("-Wno-float-equal"); diff --git a/src/targets/gpu/jit/reduce.cpp b/src/targets/gpu/jit/reduce.cpp index 
a9d506c41e7..a5b9613c5b2 100644 --- a/src/targets/gpu/jit/reduce.cpp +++ b/src/targets/gpu/jit/reduce.cpp @@ -322,7 +322,7 @@ struct fused_reduce_compiler : compiler { auto assign = v.get("assign", "assign_none"); auto axes = v.at("axes").to_vector(); - auto finputs = flatten(inputs); + auto finputs = flatten_shapes(inputs); auto noutputs = finputs.size() - inputs.size() + 1; auto virtual_inputs = finputs; virtual_inputs.push_back(get_reduced_shape(get_input_shape(finputs), axes)); diff --git a/src/targets/gpu/jit/topk.cpp b/src/targets/gpu/jit/topk.cpp index 745d5b2c7da..1deafb2db60 100644 --- a/src/targets/gpu/jit/topk.cpp +++ b/src/targets/gpu/jit/topk.cpp @@ -65,7 +65,7 @@ struct topk_compiler : compiler { hip_compile_options options; options.output = inputs.back(); - options.inputs = flatten(inputs); + options.inputs = flatten_shapes(inputs); options.kernel_name = "topk_kernel"; auto axis = v.at("axis").to(); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index ca4236c65fa..c1d4398acc9 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -137,15 +137,19 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output auto* p = reinterpret_cast(out_tv.data()) + block_id * AlignedNumBoxes; auto block_out_tv = make_tensor_view(p, block_out_shape); - const auto* boxes_b = boxes_tv.data() + batch_idx * NumBoxes * 4; - const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; + //const auto* boxes_b = boxes_tv.data() + batch_idx * NumBoxes * 4; + //const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; + // Get tensor_view slice of boxes. 
numpy slicing: boxes[batch_idx, :, :] + const auto my_boxes = slice_tensor(boxes_tv, batch_idx, slice_axes<1, 2>()); + // Get tensor_view slice of scores. numpy slicing: scores[batch_idx, class_idx, :] + const auto my_scores = slice_tensor(scores_tv, block_id, slice_axes<2>()); nms_data tmp_data; idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - tmp_data.score = scores_bc[i]; - tmp_data.box = nms_normalize_box(boxes_b + i * 4); + tmp_data.score = my_scores[i]; + tmp_data.box = nms_normalize_box(my_boxes + i * 4); tmp_data.box_index = static_cast(i); } else diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp new file mode 100644 index 00000000000..ad4c1f27fe4 --- /dev/null +++ b/test/gpu/nonmaxsuppression.cpp @@ -0,0 +1,311 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include + +#include + +static std::pair, int64_t> +run_gpu_nms(migraphx::program p, const migraphx::parameter_map& host_params = {}) +{ + migraphx::target t = migraphx::make_target("gpu"); + p.compile(t); + + migraphx::parameter_map gpu_params; + for(auto&& x : p.get_parameter_shapes()) + { + auto it = host_params.find(x.first); + if(it != host_params.end()) + gpu_params[x.first] = t.copy_to(it->second); + else + gpu_params[x.first] = t.allocate(x.second); + } + + auto results = p.eval(gpu_params); + auto idx_host = t.copy_from(results.at(0)); + auto cnt_host = t.copy_from(results.at(1)); + + std::vector indices; + idx_host.visit([&](auto v) { indices.assign(v.begin(), v.end()); }); + + int64_t num_selected = 0; + cnt_host.visit([&](auto v) { num_selected = static_cast(v[0]); }); + + return {indices, num_selected}; +} + +static void add_nms_return(migraphx::module* mm, migraphx::instruction_ref nms) +{ + auto idx = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); + auto cnt = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms); + mm->add_return({idx, cnt}); +} + +TEST_CASE(nms_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", 
{{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 3); +} + +TEST_CASE(nms_identical_all_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + std::vector boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, + 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5}; + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + std::vector scores_vec = {0.9, 0.9, 0.9, 0.9, 0.9, 0.9}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{6}); + auto iou_threshold = mm->add_literal(0.1f); + auto score_threshold = mm->add_literal(0.0f); + + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 0}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 1); +} + +TEST_CASE(nms_not_center_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + auto boxes_l 
= mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 3); +} + +TEST_CASE(nms_transpose1_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 4, 6}}; + std::vector boxes_vec = { + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + }; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + auto t_boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto transpose_boxes = mm->add_instruction( + migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + transpose_boxes, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; + 
EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 3); +} + +TEST_CASE(nms_transpose2_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {4, 1, 6}}; + std::vector boxes_vec = { + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + }; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + auto t_boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto transpose_boxes = mm->add_instruction( + migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + transpose_boxes, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 3); +} + +TEST_CASE(nms_multi_batch_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}}; + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0, + 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}}; + std::vector scores_vec = { + 0.9, 0.75, 0.6, 0.95, 
0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 1, 0, 3, 1, 0, 0, 1, 0, 5}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 6); +} + +TEST_CASE(nms_multi_class_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}}; + std::vector scores_vec = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); + auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto max_out_l = mm->add_literal(int64_t{2}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + auto [indices, num_selected] = run_gpu_nms(std::move(p)); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 1, 3, 0, 1, 0}; + 
EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 4); +} + +int main(int argc, const char* argv[]) { test::run(argc, argv); } From fc728f3642e9bbcbb8dbd96894b12ebffeb6f27d Mon Sep 17 00:00:00 2001 From: charlie Date: Mon, 18 May 2026 17:44:34 -0500 Subject: [PATCH 13/32] Remove verify NMS tests. They don't make sense for random data. --- test/verify/test_nms.cpp | 201 --------------------------------------- 1 file changed, 201 deletions(-) delete mode 100644 test/verify/test_nms.cpp diff --git a/test/verify/test_nms.cpp b/test/verify/test_nms.cpp deleted file mode 100644 index 99a502dbbfc..00000000000 --- a/test/verify/test_nms.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "verify_program.hpp" -#include -#include -#include - -struct test_nms : verify_program -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - - migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - - auto boxes_l = mm->add_parameter("boxes", boxes_s); - auto scores_l = mm->add_parameter("scores", scores_s); - auto max_out_l = mm->add_literal(int64_t{4}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); - - auto nms = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); - - //auto indices = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); - auto num_selected = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms); - //auto slice_ins = mm->add_instruction(migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); - //mm->add_return({slice_ins}); - mm->add_return({num_selected}); - - return p; - } -}; - -// Multi-batch fixed-output NMS exercises the (batch_idx, class_idx) -> block_id -// dispatch in the GPU kernel. 
-struct test_nms_multi_batch : verify_program -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - - migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}}; - migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}}; - - auto boxes_l = mm->add_parameter("boxes", boxes_s); - auto scores_l = mm->add_parameter("scores", scores_s); - auto max_out_l = mm->add_literal(int64_t{4}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); - - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); - mm->add_return({r}); - - return p; - } -}; - -// Multi-class fixed-output NMS exercises per-class greedy filtering with -// outputs interleaved by the global atomic counter. -struct test_nms_multi_class : verify_program -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - - migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}}; - - auto boxes_l = mm->add_parameter("boxes", boxes_s); - auto scores_l = mm->add_parameter("scores", scores_s); - auto max_out_l = mm->add_literal(int64_t{2}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); - - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); - mm->add_return({r}); - - return p; - } -}; - -// center_point_box=0 path with potentially flipped corner coordinates. 
-struct test_nms_not_center : verify_program -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - - migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - - auto boxes_l = mm->add_parameter("boxes", boxes_s); - auto scores_l = mm->add_parameter("scores", scores_s); - auto max_out_l = mm->add_literal(int64_t{4}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); - - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", 0}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); - mm->add_return({r}); - - return p; - } -}; - -// TODO: update this test -// Test NMS with dynamic inputs that have different compile-time spatial ranges. -// This reproduces the scenario from nms_repro_minidimmismatch.py where -// boxes has 10 spatial entries and scores has 5, but at runtime both are -// sliced/provided with spatial_dimension=5. The compile-time ranges differ: -// boxes spatial: {4, 10}, scores spatial: {4, 5} -// but runtime spatial dimensions match so NMS should succeed. -struct test_nms_dyn_slice : verify_program -{ - migraphx::program create_program() const - { - migraphx::program p; - auto* mm = p.get_main_module(); - - // boxes: [1, {4..10}, 4] — up to 10 spatial entries - migraphx::shape boxes_s{migraphx::shape::float_type, {{1, 1}, {4, 10}, {4, 4}}}; - // scores: [1, 1, {4..5}] — up to 5 spatial entries (different range!) 
- migraphx::shape scores_s{migraphx::shape::float_type, {{1, 1}, {1, 1}, {4, 5}}}; - - auto boxes_l = mm->add_parameter("boxes", boxes_s); - auto scores_l = mm->add_parameter("scores", scores_s); - - auto max_out_l = mm->add_literal(int64_t{4}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); - - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); - mm->add_return({r}); - - return p; - } - - // At runtime, both have spatial_dimension=5 (matching) - std::unordered_map get_test_dims() const - { - return {{"boxes", migraphx::shape{migraphx::shape::float_type, {1, 5, 4}}}, - {"scores", migraphx::shape{migraphx::shape::float_type, {1, 1, 5}}}}; - } -}; - From c2ddb73bbdbb89e0a198bc491bbfbb3d8cdb05dd Mon Sep 17 00:00:00 2001 From: charlie Date: Tue, 19 May 2026 16:09:35 -0500 Subject: [PATCH 14/32] Fix kernels and tests --- .../migraphx/kernels/nonmaxsuppression.hpp | 24 +- src/targets/gpu/prepare_nonmaxsuppression.cpp | 5 +- test/gpu/nonmaxsuppression.cpp | 737 ++++++++++++++++-- 3 files changed, 667 insertions(+), 99 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index c1d4398acc9..bde081bbc69 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -34,6 +34,7 @@ #include #include #include +#include namespace migraphx { @@ -130,32 +131,29 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output auto idx = make_index(); const index_int block_id = idx.group; - const int batch_idx = static_cast(block_id / NumClasses); - const int class_idx = static_cast(block_id % NumClasses); + const int batch_idx = block_id / NumClasses; + const int class_idx = block_id % 
NumClasses; constexpr auto block_out_shape = make_shape(index_ints{}); auto* p = reinterpret_cast(out_tv.data()) + block_id * AlignedNumBoxes; auto block_out_tv = make_tensor_view(p, block_out_shape); - //const auto* boxes_b = boxes_tv.data() + batch_idx * NumBoxes * 4; - //const auto* scores_bc = scores_tv.data() + (batch_idx * NumClasses + class_idx) * NumBoxes; - // Get tensor_view slice of boxes. numpy slicing: boxes[batch_idx, :, :] - const auto my_boxes = slice_tensor(boxes_tv, batch_idx, slice_axes<1, 2>()); - // Get tensor_view slice of scores. numpy slicing: scores[batch_idx, class_idx, :] - const auto my_scores = slice_tensor(scores_tv, block_id, slice_axes<2>()); + // numpy indexing: scores[batch_idx, class_idx, :] + const auto my_scores = slice_tensor(scores_tv, array{batch_idx, class_idx, 0}, slice_axes<2>()); nms_data tmp_data; idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { tmp_data.score = my_scores[i]; - tmp_data.box = nms_normalize_box(my_boxes + i * 4); + // numpy indexing: boxes[batch_idx, i, :] + tmp_data.box = nms_normalize_box(slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>())); tmp_data.box_index = static_cast(i); } else { - // Sentinel: score so it never beats any real entry - tmp_data.score = numeric_limits::lowest(); + // Sentinel score so it never beats any real entry + tmp_data.score = numeric_lowest(); tmp_data.box = array{0.f, 0.f, 0.f, 0.f}; tmp_data.box_index = -1; } @@ -317,8 +315,8 @@ template __device__ void nonmaxsuppression_compact(const Counts bc_counts, const Idx indices, - Num num_selected, - Out output) + Out output, + Num num_selected) { static_assert(NumBatchClass > 0); static_assert(NumBoxes > 0); diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp index 25f65c65b00..53514963c13 100644 --- a/src/targets/gpu/prepare_nonmaxsuppression.cpp +++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp @@ -148,6 +148,7 @@ struct 
find_nonmaxsuppression const auto iou_packed = (num_boxes * (num_boxes - 1) / 2); // Fill in missing optional scalar inputs with default literals. + // TODO: this is the wrong way to handle this. Should be checking if the input is eval'able. const shape default_max_s{shape::int64_type, {1}}; const shape default_iou_s{shape::float_type, {1}}; const shape default_thr_s{shape::float_type, {1}}; @@ -183,13 +184,13 @@ struct find_nonmaxsuppression inputs[4], mask_alloc); - auto output = + auto raw_output = m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); auto bc_counts = m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); auto compact = - m.insert_instruction(ins, make_op("gpu::nms_compact"), output, bc_counts); + m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); m.replace_instruction(ins, compact); } diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp index ad4c1f27fe4..3f7aab9b432 100644 --- a/test/gpu/nonmaxsuppression.cpp +++ b/test/gpu/nonmaxsuppression.cpp @@ -71,28 +71,40 @@ TEST_CASE(nms_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + migraphx::shape scalar_s{migraphx::shape::float_type, {1}}; + migraphx::shape int_scalar_s{migraphx::shape::int64_type, {1}}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); - auto max_out_l = mm->add_literal(int64_t{4}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); + auto boxes_p = mm->add_parameter("boxes", boxes_s); + 
auto scores_p = mm->add_parameter("scores", scores_s); + auto max_out_p = mm->add_parameter("max_out", int_scalar_s); + auto iou_threshold = mm->add_parameter("iou_threshold", scalar_s); + auto score_threshold = mm->add_parameter("score_threshold", scalar_s); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_l, - scores_l, - max_out_l, + boxes_p, + scores_p, + max_out_p, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + int64_t max_out_val = 4; + float iou_val = 0.5f; + float score_val = 0.0f; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + host_params["max_out"] = migraphx::argument(int_scalar_s, &max_out_val); + host_params["iou_threshold"] = migraphx::argument(scalar_s, &iou_val); + host_params["score_threshold"] = migraphx::argument(scalar_s, &score_val); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); @@ -104,26 +116,34 @@ TEST_CASE(nms_identical_all_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - std::vector boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, - 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5}; migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9, 0.9, 0.9, 0.9, 0.9, 0.9}; + migraphx::shape scalar_s{migraphx::shape::float_type, 
{1}}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{6}); - auto iou_threshold = mm->add_literal(0.1f); + auto iou_threshold = mm->add_parameter("iou_threshold", scalar_s); auto score_threshold = mm->add_literal(0.0f); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), - boxes_l, - scores_l, + boxes_p, + scores_p, max_out_l, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, + 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5}; + std::vector scores_vec = {0.9f, 0.9f, 0.9f, 0.9f, 0.9f, 0.9f}; + float iou_val = 0.1f; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + host_params["iou_threshold"] = migraphx::argument(scalar_s, &iou_val); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 0}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); @@ -135,28 +155,32 @@ TEST_CASE(nms_not_center_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, - 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, - 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = 
mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{4}); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), - boxes_l, - scores_l, + boxes_p, + scores_p, max_out_l, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); @@ -168,32 +192,39 @@ TEST_CASE(nms_transpose1_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 4, 6}}; - std::vector boxes_vec = { - 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - }; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + migraphx::shape int_scalar_s{migraphx::shape::int64_type, {1}}; - auto t_boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); - auto max_out_l = mm->add_literal(int64_t{4}); + auto t_boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = 
mm->add_parameter("scores", scores_s); + auto max_out_p = mm->add_parameter("max_out", int_scalar_s); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); auto transpose_boxes = mm->add_instruction( - migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l); + migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_p); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), transpose_boxes, - scores_l, - max_out_l, + scores_p, + max_out_p, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = { + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + }; + std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + int64_t max_out_val = 4; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + host_params["max_out"] = migraphx::argument(int_scalar_s, &max_out_val); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); @@ -205,32 +236,36 @@ TEST_CASE(nms_transpose2_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {4, 1, 6}}; - std::vector boxes_vec = { - 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - }; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; - std::vector scores_vec = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; - auto t_boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = 
mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto t_boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{4}); auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); auto transpose_boxes = mm->add_instruction( - migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l); + migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_p); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), transpose_boxes, - scores_l, + scores_p, max_out_l, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = { + 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + }; + std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); @@ -242,31 +277,43 @@ TEST_CASE(nms_multi_batch_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {2, 6, 4}}; - std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0, - 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 6}}; - std::vector scores_vec = { - 0.9, 0.75, 0.6, 0.95, 0.5, 
0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + migraphx::shape scalar_s{migraphx::shape::float_type, {1}}; + migraphx::shape int_scalar_s{migraphx::shape::int64_type, {1}}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); - auto max_out_l = mm->add_literal(int64_t{4}); - auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); + auto max_out_p = mm->add_parameter("max_out", int_scalar_s); + auto iou_threshold = mm->add_parameter("iou_threshold", scalar_s); + auto score_threshold = mm->add_parameter("score_threshold", scalar_s); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_l, - scores_l, - max_out_l, + boxes_p, + scores_p, + max_out_p, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0, + 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + std::vector scores_vec = { + 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + int64_t max_out_val = 4; + float iou_val = 0.5f; + float score_val = 0.0f; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + host_params["max_out"] = migraphx::argument(int_scalar_s, &max_out_val); + host_params["iou_threshold"] = migraphx::argument(scalar_s, &iou_val); + host_params["score_threshold"] = migraphx::argument(scalar_s, &score_val); + + auto [indices, num_selected] = 
run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 1, 0, 3, 1, 0, 0, 1, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); @@ -278,34 +325,556 @@ TEST_CASE(nms_multi_class_test) migraphx::program p; auto* mm = p.get_main_module(); migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, - 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, - 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 2, 6}}; - std::vector scores_vec = { - 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + migraphx::shape scalar_s{migraphx::shape::float_type, {1}}; - auto boxes_l = mm->add_literal(migraphx::literal(boxes_s, boxes_vec)); - auto scores_l = mm->add_literal(migraphx::literal(scores_s, scores_vec)); + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); auto max_out_l = mm->add_literal(int64_t{2}); auto iou_threshold = mm->add_literal(0.5f); - auto score_threshold = mm->add_literal(0.0f); + auto score_threshold = mm->add_parameter("score_threshold", scalar_s); auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_l, - scores_l, + boxes_p, + scores_p, max_out_l, iou_threshold, score_threshold); add_nms_return(mm, nms); - auto [indices, num_selected] = run_gpu_nms(std::move(p)); + std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + std::vector scores_vec = { + 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + float score_val = 0.0f; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, 
scores_vec.data()); + host_params["score_threshold"] = migraphx::argument(scalar_s, &score_val); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); std::vector gold = {0, 0, 3, 0, 0, 0, 0, 1, 3, 0, 1, 0}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); EXPECT(num_selected == 4); } +TEST_CASE(nms_20boxes_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 20, 4}}; + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 20}}; + + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); + auto max_out_l = mm->add_literal(int64_t{10}); + auto iou_threshold = mm->add_literal(0.5000f); + auto score_threshold = mm->add_literal(0.0000f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + std::vector boxes_vec = { + 32.7256f, 35.1377f, 43.0832f, 42.2579f, 13.9286f, 15.6152f, 21.5240f, 28.2727f, 44.0782f, 37.5280f, 52.9916f, 48.3318f, + 38.8011f, 32.1818f, 50.5110f, 37.5550f, 33.9761f, -1.6170f, 43.8622f, 11.0347f, 5.3569f, 42.6478f, 14.1070f, 54.9145f, + 18.9216f, 34.8446f, 27.7505f, 41.2693f, -0.4375f, 36.7849f, 4.8178f, 41.8215f, 6.9987f, 1.1282f, 8.4302f, 11.6832f, + 30.5954f, 21.0410f, 37.7095f, 23.9976f, 35.2360f, 16.6405f, 39.2402f, 20.4393f, 45.0158f, 45.7867f, 51.7352f, 46.8898f, + 9.8174f, 26.1848f, 22.7651f, 38.2017f, 16.3854f, 35.9841f, 20.6606f, 46.2920f, 22.5697f, 16.7346f, 24.3859f, 27.6069f, + 7.0039f, 5.3968f, 11.9433f, 17.3270f, 3.9409f, 24.0168f, 9.0512f, 31.4417f, 18.6518f, -1.2903f, 28.9187f, 7.6721f, + 6.9462f, 39.9030f, 15.7447f, 42.8601f, 27.5034f, 30.2815f, 39.4780f, 32.8849f}; + std::vector scores_vec = { + 0.6979f, 0.4657f, 0.8326f, 0.2503f, 0.1204f, 0.1810f, 0.7501f, 0.5157f, 0.2451f, 0.5509f, 0.2371f, 
0.7267f, + 0.5015f, 0.4429f, 0.3714f, 0.6673f, 0.4256f, 0.1789f, 0.2062f, 0.9657f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 19, 0, 0, 2, 0, 0, 6, 0, 0, 11, 0, 0, 0, 0, 0, 15, 0, 0, 9, 0, 0, 7, 0, 0, 12, 0, 0, 1}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 10); +} + +TEST_CASE(nms_50boxes_center_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 50, 4}}; + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 50}}; + + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); + auto max_out_l = mm->add_literal(int64_t{20}); + auto iou_threshold = mm->add_literal(0.4000f); + auto score_threshold = mm->add_literal(0.2000f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + std::vector boxes_vec = { + 90.8581f, 82.6292f, 23.5447f, 19.9060f, 69.9707f, 89.6161f, 29.1830f, 26.1572f, 26.5870f, 14.0249f, 15.5215f, 14.1630f, + 96.9176f, 55.4036f, 5.1730f, 8.1873f, 77.8751f, 10.8576f, 1.4042f, 7.8632f, 71.6890f, 67.2240f, 7.6600f, 22.6344f, + 44.9361f, 28.1234f, 4.8228f, 24.6805f, 27.2242f, 65.9423f, 20.6521f, 4.0531f, 9.6391f, 72.6995f, 4.5331f, 2.9302f, + 90.2602f, 76.8647f, 15.6836f, 18.2386f, 45.5776f, 10.7741f, 21.1336f, 5.2390f, 20.2363f, 91.6012f, 17.8524f, 24.9153f, + 30.5957f, 23.0214f, 6.7935f, 9.9997f, 57.9220f, 3.7413f, 24.3196f, 5.1723f, 17.6773f, 55.4852f, 21.7468f, 27.7081f, + 85.6614f, 37.0922f, 22.4305f, 5.8004f, 75.8520f, 82.9790f, 
4.8007f, 9.2569f, 71.9463f, 80.8251f, 4.5889f, 5.4548f, + 43.2093f, 31.7139f, 27.8993f, 4.3492f, 62.7309f, 95.2899f, 12.5298f, 1.6133f, 58.4098f, 29.0918f, 9.7275f, 2.6065f, + 64.9847f, 51.5057f, 15.1689f, 6.0646f, 8.4444f, 25.5965f, 20.2231f, 2.5481f, 41.5807f, 93.6044f, 28.7131f, 18.1432f, + 4.1614f, 16.4608f, 9.3069f, 20.7407f, 49.3991f, 4.4911f, 27.8194f, 12.4153f, 32.9861f, 43.5097f, 1.7209f, 10.2217f, + 14.4524f, 99.2376f, 17.1007f, 15.6313f, 10.3403f, 89.1677f, 19.3853f, 26.3751f, 58.7645f, 74.8608f, 4.0710f, 25.6828f, + 17.0593f, 89.0792f, 5.0698f, 2.2608f, 92.5120f, 89.3447f, 13.1543f, 6.2635f, 58.1061f, 51.8858f, 29.0207f, 7.8656f, + 34.6870f, 31.5929f, 18.2852f, 8.2322f, 59.0915f, 77.2012f, 28.0577f, 17.5657f, 2.2804f, 66.1661f, 24.3265f, 13.0716f, + 95.8559f, 37.3658f, 14.5541f, 2.4284f, 48.2303f, 9.4467f, 23.7581f, 11.8348f, 78.2735f, 74.6790f, 1.5173f, 16.1888f, + 8.2730f, 26.2461f, 4.1652f, 3.9485f, 48.6658f, 93.6813f, 25.0534f, 25.1703f, 49.0707f, 24.0971f, 24.1077f, 2.5069f, + 93.7826f, 12.2758f, 7.7466f, 27.8204f, 57.1728f, 83.1113f, 16.3923f, 3.8743f, 47.3489f, 15.3284f, 18.5745f, 25.4637f, + 26.6976f, 17.9268f, 26.1644f, 27.1769f, 33.1569f, 59.9383f, 18.4901f, 29.4075f, 52.0672f, 87.4562f, 12.9646f, 24.2588f, + 43.8911f, 19.6435f, 11.8513f, 23.6048f, 2.1612f, 31.0324f, 13.3506f, 19.6320f}; + std::vector scores_vec = { + 0.8011f, 0.2211f, 0.5825f, 0.5628f, 0.8718f, 0.5165f, 0.4466f, 0.6756f, 0.3398f, 0.2258f, 0.5301f, 0.4752f, + 0.3093f, 0.4308f, 0.4298f, 0.3947f, 0.4415f, 0.7172f, 0.3672f, 0.9540f, 0.9247f, 0.5328f, 0.3955f, 0.5819f, + 0.8637f, 0.6873f, 0.8240f, 0.5795f, 0.6696f, 0.3593f, 0.7614f, 0.2822f, 0.7253f, 0.8746f, 0.2189f, 0.6529f, + 0.1856f, 0.7531f, 0.1760f, 0.9423f, 0.2237f, 0.9630f, 0.8208f, 0.6343f, 0.8044f, 0.8156f, 0.9514f, 0.3280f, + 0.6311f, 0.1855f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, 
scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 41, 0, 0, 19, 0, 0, 46, 0, 0, 39, 0, 0, 20, 0, 0, 33, 0, 0, 4, 0, 0, 24, 0, 0, 26, 0, 0, 42, 0, 0, 45, 0, 0, 44, 0, 0, 0, 0, 0, 30, 0, 0, 32, 0, 0, 17, 0, 0, 25, 0, 0, 7, 0, 0, 28, 0, 0, 35}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 20); +} + +TEST_CASE(nms_100boxes_2batch_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {2, 100, 4}}; + migraphx::shape scores_s{migraphx::shape::float_type, {2, 1, 100}}; + + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); + auto max_out_l = mm->add_literal(int64_t{15}); + auto iou_threshold = mm->add_literal(0.5000f); + auto score_threshold = mm->add_literal(0.1000f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + std::vector boxes_vec = { + -3.8699f, 108.8880f, 20.8101f, 137.5783f, 149.9079f, 29.3134f, 203.7504f, 39.2031f, 121.6031f, 107.1528f, 162.2282f, 118.8275f, + 27.1146f, 87.2265f, 42.1365f, 141.7457f, -7.3128f, 91.3799f, 44.0012f, 95.0142f, 25.9397f, 97.1572f, 47.4736f, 111.8955f, + 170.3318f, 143.6689f, 221.6791f, 161.9004f, 82.3933f, 144.8881f, 101.0310f, 174.8098f, 138.9017f, 80.6305f, 174.7306f, 116.2308f, + 115.0719f, 104.8666f, 139.4914f, 134.9707f, 105.8753f, 183.2658f, 123.0900f, 189.2287f, 2.3726f, 16.2585f, 55.6795f, 31.6349f, + 183.1709f, -1.9651f, 195.2389f, 48.8066f, 57.2666f, -1.7671f, 63.2705f, 36.8507f, 105.0166f, 111.9228f, 126.1903f, 151.2225f, + 118.2848f, 63.4507f, 161.6255f, 103.9927f, 105.5274f, 131.8586f, 154.1659f, 177.8699f, 158.1560f, 132.0321f, 218.0818f, 136.4605f, + 20.4451f, 55.4126f, 38.9305f, 78.0425f, 89.1363f, 
163.2572f, 114.2048f, 196.0894f, 76.2707f, 142.0220f, 85.3431f, 162.9909f, + 77.3750f, 28.6949f, 112.2925f, 79.5191f, -6.0851f, 58.1025f, 53.7721f, 87.5743f, 5.6429f, 39.7135f, 47.9949f, 86.0625f, + 37.5563f, 5.8879f, 73.6739f, 57.1568f, 48.8660f, 14.1653f, 73.0158f, 44.9480f, 58.0793f, 159.8937f, 113.0820f, 214.5573f, + 107.0385f, 69.7607f, 137.3566f, 105.4010f, 122.4620f, 51.0809f, 131.3896f, 102.2471f, 71.0835f, 135.3897f, 93.6408f, 156.4846f, + 79.2752f, 95.3835f, 84.2380f, 125.8137f, 37.0673f, 171.0514f, 49.9841f, 203.4046f, 116.6400f, 152.4634f, 118.6825f, 159.6572f, + 49.5364f, 83.6166f, 77.2799f, 108.1312f, -12.0070f, 47.7104f, 26.4309f, 102.8334f, 73.0529f, 178.2168f, 94.3071f, 216.4359f, + 81.9253f, 137.8156f, 107.7278f, 149.2885f, 16.3219f, 179.7427f, 73.9152f, 200.7352f, 91.8087f, 17.5434f, 137.1745f, 29.8480f, + 96.6991f, 168.8745f, 129.6096f, 171.3390f, 131.5065f, 99.5547f, 149.2944f, 155.2749f, 102.6283f, 10.6622f, 156.5511f, 38.1065f, + 123.0512f, 108.0793f, 137.9220f, 127.2239f, 53.1452f, 119.0642f, 73.3404f, 155.3743f, 130.1690f, 1.7448f, 184.8039f, 3.1763f, + 93.7074f, 82.1619f, 125.9504f, 99.5652f, 63.8853f, 143.8404f, 108.6820f, 186.3194f, 107.2755f, 39.8756f, 143.1295f, 78.2680f, + 52.3550f, 62.2463f, 91.9079f, 121.1729f, 93.2160f, 69.6623f, 111.8797f, 107.2634f, 139.7207f, 45.7991f, 154.9616f, 74.9719f, + 167.2671f, 160.7261f, 187.2941f, 206.6506f, 179.1259f, 129.1106f, 189.2970f, 183.4070f, 74.4343f, 0.3572f, 127.0189f, 43.8782f, + 95.1992f, 170.4922f, 112.9108f, 228.3217f, 142.9101f, 152.2709f, 177.0380f, 199.4092f, 39.0269f, 30.7110f, 86.7534f, 82.8523f, + 143.8537f, 163.5132f, 191.0993f, 171.2454f, 85.3959f, -0.8223f, 112.2607f, 43.3901f, 8.6218f, 186.3383f, 37.7209f, 213.3036f, + -15.4319f, 116.3204f, 44.2555f, 149.9535f, 147.9980f, 110.2290f, 188.7993f, 149.8210f, -13.4183f, -11.0214f, 35.6454f, 47.1977f, + 28.9969f, 149.8616f, 83.2476f, 208.9517f, 43.0921f, -3.2028f, 90.5599f, 14.8026f, 28.6361f, 26.0199f, 40.5617f, 70.3113f, + 
45.6946f, 5.9799f, 79.8627f, 51.2289f, 145.0326f, 144.6320f, 152.0444f, 166.0751f, -16.8246f, 35.4867f, 22.6978f, 43.7950f, + 136.7519f, 180.4197f, 194.1175f, 183.8356f, 155.6840f, 107.8222f, 186.9352f, 154.6854f, 61.1796f, -7.7136f, 87.7250f, 22.1787f, + 29.1652f, -28.4875f, 32.2799f, 30.6594f, 91.3547f, -3.8851f, 148.9814f, 24.5483f, 20.3959f, 91.8365f, 27.4731f, 150.5336f, + 71.2720f, 147.6549f, 74.6957f, 172.9379f, 183.9269f, 23.7969f, 199.4448f, 71.6242f, 196.6597f, 166.8796f, 201.5260f, 172.8839f, + 140.4950f, -5.4397f, 168.3470f, 28.3325f, 46.4677f, 136.0320f, 77.9169f, 184.3535f, 127.8122f, 157.7804f, 147.2538f, 213.3378f, + 139.0779f, 129.6555f, 143.0846f, 179.1879f, 73.7761f, 138.0335f, 81.3605f, 141.2148f, 116.3348f, 156.1013f, 140.0206f, 179.0908f, + -0.1401f, 6.0937f, 4.4311f, 9.9669f, 20.7149f, 36.6326f, 62.9081f, 44.0802f, 98.4106f, 4.5632f, 111.6248f, 45.4062f, + 23.3391f, 79.3651f, 42.1614f, 122.4473f, 21.0547f, 125.7129f, 45.3081f, 172.3624f, 154.4709f, 99.9714f, 180.0508f, 152.0333f, + 197.2776f, 147.9130f, 198.3756f, 192.5394f, 107.3878f, 6.9169f, 115.0000f, 55.1683f, 141.8624f, 144.9798f, 193.7655f, 148.8687f, + 197.5280f, 31.1895f, 198.6007f, 46.0271f, 12.8282f, 35.3058f, 43.8101f, 72.9977f, 74.7088f, 116.1662f, 104.5894f, 167.7956f, + 68.1883f, 195.4082f, 88.8408f, 196.6737f, 2.7857f, 106.6272f, 29.2340f, 137.9903f, 127.5389f, -9.5799f, 174.5932f, 31.3800f, + 61.4403f, 121.8884f, 112.0713f, 124.6352f, 15.4868f, 35.9096f, 55.8899f, 68.2298f, 35.5922f, 56.6701f, 44.2246f, 72.3261f, + 163.1796f, 40.7751f, 180.4136f, 56.2181f, 177.9262f, 90.7157f, 187.1069f, 101.2297f, 33.5656f, 108.4211f, 51.2933f, 164.8822f, + 73.5555f, 18.9549f, 114.3649f, 72.3462f, 119.3443f, 42.7151f, 174.0536f, 89.5792f, 169.1987f, 170.3059f, 182.1476f, 201.8479f, + 59.3192f, -5.2591f, 92.3019f, 24.6868f, 82.2129f, 76.0264f, 124.5949f, 108.2814f, 119.7321f, 125.9828f, 176.9545f, 158.6404f, + 127.7304f, 16.7712f, 164.7240f, 43.4104f, 148.5664f, 5.0880f, 164.6177f, 13.8616f, 
95.0352f, 23.4340f, 132.9384f, 31.8482f, + 10.9685f, 155.1733f, 30.8775f, 212.3560f, 151.4989f, -12.8680f, 210.0904f, 16.5719f, 160.8241f, 9.0448f, 185.4050f, 66.2840f, + 138.8994f, 0.9312f, 180.3396f, 11.5822f, 18.7873f, 5.2706f, 21.1577f, 38.9812f, 28.5777f, 117.4022f, 53.1813f, 130.6575f, + 122.4044f, 40.3588f, 175.0358f, 56.2967f, -13.8737f, 112.4558f, 23.1297f, 115.2290f, 182.2486f, 114.0300f, 209.4412f, 122.0482f, + 47.3188f, 142.3400f, 103.5391f, 197.4341f, 118.1700f, -9.0369f, 169.5550f, 10.9335f, 167.5089f, 152.2341f, 187.5196f, 189.1137f, + 62.3618f, 109.6059f, 95.4902f, 138.0417f, 48.8767f, 20.2354f, 78.7763f, 44.8620f, 102.5983f, 138.3968f, 140.8982f, 170.7781f, + 105.8416f, 165.0748f, 126.5542f, 177.1219f, 74.1239f, 21.1889f, 89.5320f, 80.5165f, 92.9311f, 159.1187f, 147.7788f, 208.3988f, + 159.3220f, 68.5139f, 214.8306f, 113.2691f, 68.1500f, 106.3565f, 118.9061f, 135.0133f, 9.9914f, 191.9200f, 68.7055f, 201.9398f, + 52.9639f, 44.6476f, 97.9184f, 99.9669f, 55.7637f, 152.0609f, 101.8791f, 173.2028f, 3.2253f, 61.7017f, 49.2181f, 65.6580f, + 17.8964f, 149.2418f, 47.2522f, 170.4436f, 122.9471f, 96.2103f, 150.8778f, 144.0833f, 60.3089f, 24.4012f, 75.4822f, 62.1410f, + 171.4575f, 60.1555f, 210.5018f, 105.4550f, 39.6844f, 39.6149f, 57.7543f, 87.4394f, 11.6796f, 8.8690f, 27.8902f, 22.3743f, + 132.9151f, -21.7847f, 168.4868f, 33.7186f, 163.6127f, 55.8750f, 188.8017f, 82.7164f, 48.6664f, -15.5441f, 62.5789f, 23.1577f, + 15.8440f, 32.5294f, 64.9913f, 33.6657f, 11.2664f, 115.2323f, 63.0400f, 174.8410f, 98.9553f, 132.8318f, 109.8496f, 150.4047f, + 92.9619f, 145.3852f, 94.4048f, 150.0469f, 41.4721f, 49.4119f, 62.3038f, 77.4494f, -14.9919f, 173.6975f, 33.0612f, 182.3103f, + 71.0426f, 113.7725f, 121.5539f, 123.7598f, 187.2858f, 6.0529f, 196.4472f, 44.3576f, 107.1609f, 16.6524f, 153.8468f, 40.8351f, + 95.1880f, 110.9244f, 103.0146f, 166.3137f, 10.1316f, 24.6737f, 34.1453f, 44.5039f, 20.5283f, 79.5362f, 80.4462f, 123.3809f, + 52.7734f, 184.2525f, 65.1362f, 212.4573f, 
147.9188f, -19.1670f, 158.0026f, 20.7701f, 162.3696f, -14.8751f, 188.3148f, 21.5070f, + 161.5482f, 184.1698f, 199.1086f, 213.0640f, 168.8931f, 88.4010f, 224.9343f, 145.4546f, 167.0391f, 14.7719f, 225.9076f, 35.9920f, + 188.0454f, 173.7320f, 193.1542f, 185.1889f, 9.7935f, 155.5723f, 18.9354f, 196.5798f, 3.7319f, 81.7829f, 51.3855f, 132.6973f, + 52.4097f, 122.6709f, 69.3770f, 126.0459f, 83.9766f, 40.8733f, 137.1827f, 68.4016f, -0.6763f, -16.7244f, 39.4674f, 36.9323f, + 165.3600f, 96.2998f, 172.9588f, 141.5273f, 98.2916f, 29.1927f, 148.4108f, 88.7094f, 102.7704f, 116.5475f, 114.1754f, 148.9009f, + 20.0692f, 147.2792f, 46.0554f, 187.2189f, 33.8616f, -5.7911f, 67.4406f, 13.0553f, 16.7898f, 90.6905f, 47.3350f, 147.5951f, + 149.6448f, 34.9492f, 191.1284f, 57.5630f, 97.0913f, 152.4916f, 136.5998f, 197.0638f, 117.2606f, 38.3403f, 176.7911f, 63.1255f, + 29.2236f, 105.0804f, 89.1895f, 139.2277f, 58.5150f, 88.9746f, 89.9861f, 132.4418f, 77.6626f, 63.7197f, 84.2794f, 94.7469f, + 130.0316f, 108.2651f, 173.9744f, 162.7832f, 125.1590f, 132.2845f, 183.7822f, 158.0233f, 31.4721f, 93.7989f, 51.2533f, 132.9762f, + 174.2021f, 141.0848f, 202.4134f, 162.2841f, 11.1001f, 184.1428f, 37.1620f, 209.2240f, 177.2076f, 70.3730f, 181.2413f, 97.3360f, + -0.2527f, 98.7053f, 40.4109f, 107.1279f, 41.9845f, -0.7119f, 63.8314f, 5.6998f, 145.5655f, 139.0148f, 193.0259f, 179.3967f, + 10.8509f, 84.2082f, 60.9460f, 123.8838f, 57.9873f, 61.5364f, 107.4399f, 101.6481f, 77.1802f, 17.7313f, 102.7635f, 19.8975f, + 39.0662f, 167.7982f, 59.0374f, 188.0644f, 119.4588f, 72.6661f, 164.6393f, 85.3368f, 146.1259f, 113.0609f, 194.4079f, 159.9718f, + 159.9229f, 3.9862f, 189.9071f, 55.7634f, 41.0200f, 184.5329f, 94.7088f, 200.0870f}; + std::vector scores_vec = { + 0.1439f, 0.8791f, 0.0961f, 0.1535f, 0.5338f, 0.0675f, 0.0528f, 0.0005f, 0.4363f, 0.7746f, 0.0348f, 0.6523f, + 0.8231f, 0.1680f, 0.1469f, 0.8608f, 0.8231f, 0.5389f, 0.8192f, 0.0928f, 0.3945f, 0.7378f, 0.2575f, 0.7523f, + 0.5042f, 0.7503f, 0.4647f, 0.3679f, 
0.2192f, 0.2084f, 0.7515f, 0.1189f, 0.0860f, 0.1763f, 0.1753f, 0.8231f, + 0.3985f, 0.9904f, 0.1372f, 0.6535f, 0.4487f, 0.3929f, 0.8751f, 0.9756f, 0.8729f, 0.1923f, 0.2208f, 0.6561f, + 0.2891f, 0.7347f, 0.5664f, 0.5509f, 0.8285f, 0.7105f, 0.0266f, 0.0495f, 0.6016f, 0.4862f, 0.2602f, 0.4187f, + 0.7579f, 0.8266f, 0.5612f, 0.3854f, 0.2707f, 0.5219f, 0.3147f, 0.5641f, 0.6767f, 0.0661f, 0.0011f, 0.2123f, + 0.8945f, 0.6463f, 0.1720f, 0.8903f, 0.4700f, 0.4761f, 0.9355f, 0.0595f, 0.2152f, 0.5858f, 0.1955f, 0.6795f, + 0.2141f, 0.0992f, 0.2070f, 0.4227f, 0.1761f, 0.1347f, 0.8603f, 0.3204f, 0.3608f, 0.0553f, 0.3574f, 0.2648f, + 0.6105f, 0.2054f, 0.8884f, 0.9297f, 0.0998f, 0.1074f, 0.1153f, 0.6196f, 0.1220f, 0.8524f, 0.7543f, 0.8198f, + 0.5261f, 0.9967f, 0.0442f, 0.4013f, 0.3239f, 0.9486f, 0.5769f, 0.8062f, 0.1703f, 0.9786f, 0.4986f, 0.4937f, + 0.9709f, 0.3807f, 0.3975f, 0.5848f, 0.1281f, 0.3211f, 0.1932f, 0.1033f, 0.8661f, 0.5893f, 0.3587f, 0.4087f, + 0.4315f, 0.6331f, 0.9268f, 0.9328f, 0.3915f, 0.3293f, 0.4510f, 0.5679f, 0.4618f, 0.6588f, 0.5544f, 0.3207f, + 0.3457f, 0.3786f, 0.0946f, 0.1661f, 0.7231f, 0.3891f, 0.2145f, 0.5627f, 0.7555f, 0.2574f, 0.8268f, 0.9275f, + 0.5974f, 0.6689f, 0.0526f, 0.9455f, 0.3925f, 0.9239f, 0.5790f, 0.0046f, 0.0385f, 0.6804f, 0.5627f, 0.0265f, + 0.7435f, 0.8521f, 0.4964f, 0.4658f, 0.0055f, 0.7866f, 0.3307f, 0.8788f, 0.3731f, 0.5651f, 0.2703f, 0.1606f, + 0.7749f, 0.4966f, 0.5365f, 0.9654f, 0.9636f, 0.8556f, 0.1876f, 0.5943f, 0.8781f, 0.3745f, 0.1011f, 0.8110f, + 0.4818f, 0.5644f, 0.9821f, 0.6072f, 0.4250f, 0.3700f, 0.4176f, 0.1184f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 37, 0, 0, 43, 0, 0, 78, 0, 0, 99, 0, 0, 72, 0, 0, 75, 0, 0, 98, 0, 0, 1, 0, 0, 42, 
0, 0, 44, 0, 0, 15, 0, 0, 90, 0, 0, 52, 0, 0, 61, 0, 0, 12, 1, 0, 9, 1, 0, 94, 1, 0, 17, 1, 0, 20, 1, 0, 83, 1, 0, 84, 1, 0, 13, 1, 0, 59, 1, 0, 35, 1, 0, 55, 1, 0, 34, 1, 0, 61, 1, 0, 75, 1, 0, 88, 1, 0, 28}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 30); +} + +TEST_CASE(nms_30boxes_3class_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 30, 4}}; + migraphx::shape scores_s{migraphx::shape::float_type, {1, 3, 30}}; + + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", scores_s); + auto max_out_l = mm->add_literal(int64_t{5}); + auto iou_threshold = mm->add_literal(0.4500f); + auto score_threshold = mm->add_literal(0.1500f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + std::vector boxes_vec = { + 31.2680f, 53.5348f, 37.7043f, 73.6253f, 1.8071f, 55.2945f, 3.9368f, 78.7402f, 40.5016f, 12.5670f, 45.0345f, 32.9366f, + 78.2552f, 12.9548f, 80.7117f, 35.6526f, 73.9527f, 67.9870f, 79.4405f, 71.9065f, -3.8066f, -7.7339f, 10.2705f, 11.5692f, + 45.4706f, 34.8613f, 67.4569f, 48.4119f, 17.4632f, 30.3439f, 30.8192f, 43.8443f, 64.5403f, 44.3725f, 79.9380f, 66.0477f, + 0.7877f, 1.3956f, 6.4307f, 24.7471f, 65.1632f, 44.8608f, 84.5766f, 62.0721f, 59.3935f, 24.0849f, 74.6026f, 36.1925f, + -1.0372f, 43.7485f, 19.8379f, 55.2458f, -6.6257f, -1.7353f, 16.1976f, 8.1505f, 62.2758f, 32.2798f, 71.2775f, 41.5966f, + 10.9190f, 36.7777f, 14.0023f, 46.7824f, 39.6937f, 15.6139f, 45.8900f, 18.6783f, 67.7244f, 9.7794f, 78.7948f, 12.5604f, + 34.0204f, 5.6094f, 56.7713f, 24.5464f, 26.9281f, 21.9014f, 36.6292f, 33.1611f, 26.2374f, -3.4581f, 44.9652f, 18.9477f, + -1.6661f, 68.2450f, 11.7649f, 83.3261f, 74.8979f, 31.4950f, 80.1025f, 33.3041f, 20.6639f, 62.4061f, 29.0408f, 67.0291f, + 7.1374f, 75.0864f, 
23.1608f, 80.8203f, 14.6460f, -5.2621f, 31.1216f, 18.1798f, 71.6501f, 49.1185f, 82.6496f, 55.1487f, + 4.4135f, 63.2815f, 10.6723f, 76.1439f, 60.5823f, 39.4727f, 78.1862f, 62.0048f, 54.1855f, 22.5844f, 59.0696f, 46.0598f}; + std::vector scores_vec = { + 0.9367f, 0.1879f, 0.1073f, 0.4976f, 0.5195f, 0.5082f, 0.4367f, 0.9948f, 0.4863f, 0.4779f, 0.4218f, 0.0668f, + 0.5930f, 0.2280f, 0.6376f, 0.0508f, 0.9814f, 0.4690f, 0.8968f, 0.4756f, 0.0603f, 0.8222f, 0.6482f, 0.7818f, + 0.4282f, 0.6379f, 0.8562f, 0.6311f, 0.3477f, 0.6625f, 0.6719f, 0.9606f, 0.3709f, 0.4251f, 0.8121f, 0.5058f, + 0.7366f, 0.4597f, 0.2155f, 0.7452f, 0.1312f, 0.1986f, 0.6268f, 0.7473f, 0.8947f, 0.2726f, 0.1107f, 0.9560f, + 0.1544f, 0.1977f, 0.2913f, 0.5294f, 0.8828f, 0.7605f, 0.7082f, 0.1752f, 0.3577f, 0.4784f, 0.1474f, 0.2734f, + 0.3083f, 0.1273f, 0.5502f, 0.7050f, 0.0699f, 0.4811f, 0.7822f, 0.7480f, 0.8151f, 0.4482f, 0.8206f, 0.2408f, + 0.3608f, 0.1764f, 0.4675f, 0.3921f, 0.2409f, 0.7518f, 0.3138f, 0.2728f, 0.1309f, 0.4388f, 0.3030f, 0.3693f, + 0.2360f, 0.7632f, 0.9300f, 0.4979f, 0.6430f, 0.8672f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 7, 0, 0, 16, 0, 0, 0, 0, 0, 18, 0, 0, 26, 0, 1, 1, 0, 1, 17, 0, 1, 14, 0, 1, 22, 0, 1, 4, 0, 2, 26, 0, 2, 29, 0, 2, 10, 0, 2, 6, 0, 2, 25}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 15); +} + +TEST_CASE(nms_200boxes_2batch_2class_test) +{ + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_s{migraphx::shape::float_type, {2, 200, 4}}; + migraphx::shape scores_s{migraphx::shape::float_type, {2, 2, 200}}; + + auto boxes_p = mm->add_parameter("boxes", boxes_s); + auto scores_p = mm->add_parameter("scores", 
scores_s); + auto max_out_l = mm->add_literal(int64_t{25}); + auto iou_threshold = mm->add_literal(0.3000f); + auto score_threshold = mm->add_literal(0.2500f); + + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + std::vector boxes_vec = { + 132.1894f, 453.1169f, 199.9736f, 545.7127f, 64.3090f, 275.1729f, 104.8258f, 338.3436f, 76.1273f, 401.7875f, 135.6448f, 487.9920f, + 12.8305f, 442.3624f, 77.1708f, 466.2458f, -5.9609f, 340.1129f, 126.0715f, 451.3386f, 15.0119f, 224.3769f, 56.2927f, 236.5545f, + 427.8277f, -14.2917f, 561.9954f, 95.4457f, 4.7940f, -55.8461f, 69.2637f, 71.6517f, 41.3494f, 202.9014f, 91.1927f, 274.2992f, + 375.6902f, 208.6749f, 451.5645f, 285.6396f, 258.4982f, 179.9212f, 321.7420f, 227.4412f, 367.5344f, 211.3590f, 406.8828f, 356.8083f, + 277.5064f, 220.9636f, 353.4056f, 331.1991f, 429.2783f, 390.3169f, 452.8968f, 446.2962f, 292.5150f, 40.8054f, 345.9525f, 67.8517f, + 218.4112f, 95.7302f, 303.7139f, 129.4475f, 325.0759f, 361.4403f, 387.6738f, 431.5647f, 161.8149f, 353.1971f, 285.5779f, 494.6398f, + 153.4061f, 442.2182f, 192.6577f, 552.6060f, 161.0782f, 419.9203f, 306.5742f, 452.9917f, 25.8953f, 380.4122f, 143.8188f, 509.4868f, + 325.7002f, 128.4980f, 470.8716f, 185.8499f, 67.4107f, 136.8775f, 193.2931f, 264.7841f, 65.6790f, 115.5359f, 87.8525f, 152.5492f, + 83.4548f, 256.5595f, 162.8974f, 349.7399f, 407.8717f, 399.8657f, 434.1985f, 538.9396f, 103.6427f, 152.6073f, 226.5586f, 192.0336f, + 299.0049f, 226.3779f, 387.0450f, 330.6239f, 408.0779f, 74.0950f, 448.3318f, 222.2046f, -30.8828f, 73.1804f, 108.6275f, 96.6196f, + 373.4308f, 90.5068f, 391.5936f, 104.6787f, 111.3250f, -21.7549f, 196.3405f, 79.7002f, 54.0937f, 448.8364f, 162.5287f, 500.4571f, + 339.5665f, 195.6321f, 349.3349f, 207.2475f, 409.8580f, 381.1502f, 499.9386f, 452.9707f, 86.2250f, 284.0088f, 208.7943f, 397.3206f, + 278.8861f, 74.2190f, 289.9477f, 117.7022f, 
106.2550f, 62.2701f, 183.5792f, 113.1921f, 257.3803f, 342.4895f, 296.9053f, 469.4987f, + 261.0432f, 93.1105f, 360.8189f, 171.6012f, 295.8262f, 393.3591f, 314.5092f, 519.9261f, 241.4629f, 36.2717f, 382.0835f, 103.7837f, + 0.3826f, 267.3577f, 134.6972f, 410.3510f, 332.4151f, 358.2527f, 361.1253f, 456.2211f, 312.7919f, 108.4937f, 361.9585f, 126.7627f, + 297.0153f, 71.6643f, 385.5729f, 204.5431f, -16.9604f, 445.3092f, 91.0309f, 519.2097f, 189.9415f, 121.2467f, 256.8973f, 143.3509f, + 192.3739f, 203.1031f, 216.6613f, 226.8539f, 35.0965f, 164.5365f, 51.6150f, 267.9791f, 36.2014f, 122.4881f, 186.1665f, 130.5466f, + 186.0576f, 366.0443f, 254.9050f, 409.7468f, 305.9496f, 375.0105f, 436.9568f, 396.8388f, 82.0940f, 155.7987f, 154.9680f, 222.5193f, + 345.6593f, 386.1935f, 484.0906f, 448.9323f, 265.8611f, 67.1577f, 279.9372f, 145.9173f, 371.2164f, -19.1800f, 389.2053f, 23.4858f, + 166.5204f, 282.6964f, 306.0356f, 288.4709f, 178.5089f, 450.7671f, 320.6853f, 543.3107f, 285.9132f, -9.0198f, 333.8062f, 47.6641f, + 437.0255f, 54.9746f, 490.9451f, 153.0235f, 211.6987f, 250.8616f, 280.1138f, 268.0530f, 232.8247f, 403.4440f, 295.8328f, 406.4968f, + 286.3401f, 25.5231f, 315.6569f, 63.5189f, 301.3286f, 163.1046f, 436.1865f, 232.1301f, 16.5538f, 343.6795f, 55.2966f, 403.3963f, + 204.8009f, 124.9041f, 310.8865f, 246.6391f, 235.2927f, 65.7693f, 246.2989f, 123.0671f, 457.4555f, 57.7300f, 464.2295f, 137.7658f, + 197.5504f, 160.3075f, 295.9562f, 249.7413f, 208.4036f, 237.5821f, 259.9170f, 241.8350f, 431.7683f, 392.0298f, 530.4317f, 469.7846f, + 217.7836f, 294.9363f, 232.7928f, 347.3161f, 19.1783f, 313.3156f, 161.7061f, 377.0863f, 52.1937f, 483.5222f, 164.7224f, 499.4650f, + -18.1881f, 147.1016f, 113.3757f, 264.7419f, -10.3830f, 130.9681f, 10.9511f, 272.3863f, 191.6208f, 459.5145f, 240.3248f, 463.8325f, + 356.6797f, 77.6355f, 412.5629f, 168.2401f, 326.2139f, 307.5013f, 407.2526f, 422.3140f, -6.5422f, 355.5684f, 38.6912f, 399.0047f, + 279.9745f, -10.2789f, 290.0085f, 108.0669f, 49.1601f, 
186.5052f, 105.1230f, 281.7262f, 451.0742f, 30.5586f, 490.0021f, 170.0038f, + 54.4314f, 19.1028f, 112.9336f, 166.2725f, 298.1461f, 228.2593f, 328.4931f, 235.5688f, 143.1079f, 111.0670f, 183.1305f, 178.3627f, + 273.5727f, 356.7796f, 367.9886f, 439.2808f, 176.7118f, 442.3701f, 235.5468f, 465.2348f, 353.5905f, 375.8070f, 406.0526f, 426.9136f, + 75.0636f, 58.9357f, 155.6155f, 207.0952f, 394.8923f, 135.3580f, 510.8995f, 138.7764f, 221.3792f, 93.1523f, 278.8305f, 161.5760f, + 333.7764f, 4.2413f, 422.3168f, 130.7968f, 352.3830f, 447.2686f, 497.3472f, 496.5298f, 460.0268f, 164.7789f, 538.8018f, 237.2689f, + 43.6929f, 38.9803f, 180.2527f, 185.7092f, 83.8176f, 387.4572f, 203.0748f, 459.2138f, 120.3420f, 189.3440f, 130.0911f, 209.8513f, + 98.9678f, 13.2052f, 163.9035f, 21.9117f, 238.6976f, 10.0373f, 343.7471f, 151.9043f, 422.7512f, 299.3224f, 570.7713f, 339.9280f, + 460.4900f, 353.3999f, 529.7881f, 429.5054f, 255.9741f, 98.2099f, 270.7991f, 112.7245f, 277.1439f, 426.6355f, 361.8833f, 490.7601f, + 420.0563f, 355.7057f, 439.9143f, 495.2914f, 409.9785f, 386.2606f, 522.9550f, 462.1201f, 63.6084f, 40.9810f, 140.2522f, 186.6801f, + 209.8752f, 5.4847f, 318.6665f, 45.0513f, 351.1511f, 395.6231f, 481.6860f, 471.8004f, 104.2444f, 88.3651f, 198.9577f, 217.4352f, + 173.7778f, 275.5634f, 266.0312f, 343.3530f, 436.0951f, 358.6616f, 549.5261f, 401.3052f, 429.2604f, -0.0863f, 555.7863f, 128.3795f, + 387.8089f, 360.8724f, 518.2979f, 419.9659f, 396.0101f, 429.2169f, 402.4382f, 509.2946f, 92.6291f, 290.9362f, 176.5014f, 437.4388f, + 143.8130f, 206.2184f, 177.0371f, 235.0044f, 209.0457f, 415.3847f, 338.2372f, 461.2934f, 231.5831f, 260.9141f, 329.1943f, 266.5435f, + 220.9448f, 342.6935f, 284.5580f, 402.0774f, 303.8214f, 394.8393f, 332.8489f, 425.6666f, 178.4043f, 323.5138f, 229.9188f, 425.8390f, + 321.6556f, 129.9190f, 427.5185f, 157.9359f, 151.0502f, 8.1484f, 182.4998f, 109.6955f, 157.8666f, 99.0403f, 172.8104f, 139.2982f, + -3.0452f, 224.4737f, 130.2711f, 278.4012f, 36.9224f, 226.1483f, 
151.7898f, 279.1286f, 409.8757f, 237.4242f, 440.6452f, 345.2202f, + 200.8640f, 162.2960f, 245.4184f, 232.8059f, 41.0147f, 366.0289f, 186.8531f, 420.8625f, 326.4108f, 392.5565f, 432.9303f, 520.5973f, + 231.0067f, 80.2522f, 322.9745f, 166.4729f, -12.8403f, 351.8312f, 33.9963f, 384.6920f, 135.3959f, 271.4291f, 180.9655f, 406.5427f, + 85.0562f, 235.5178f, 91.9452f, 287.5727f, 273.1645f, 90.8612f, 382.7083f, 97.6691f, 133.7990f, 360.2684f, 141.2321f, 434.9638f, + 31.6115f, 470.5798f, 33.3353f, 490.0465f, -27.3799f, 342.6524f, 82.3149f, 379.1839f, 219.6726f, 402.7702f, 362.0547f, 515.0898f, + -45.9977f, 481.8516f, 67.7212f, 502.3336f, 388.7589f, 115.4080f, 460.0333f, 236.6427f, 40.9882f, 248.8122f, 114.4089f, 389.4114f, + 270.2910f, 191.2797f, 336.2753f, 282.6530f, 197.6581f, 439.8926f, 247.0300f, 546.7361f, 182.0580f, -6.7583f, 260.7935f, 100.5661f, + 3.2778f, 131.7233f, 68.5193f, 280.6516f, 356.3126f, 411.8249f, 446.4396f, 463.7141f, 379.1163f, 129.3928f, 513.9362f, 154.6585f, + -69.1199f, 354.7185f, 80.1365f, 433.0744f, 82.9357f, 151.1645f, 95.6685f, 231.6187f, 422.7932f, 476.2348f, 481.1110f, 503.7437f, + 260.7842f, 395.5883f, 288.7094f, 487.9416f, 48.2868f, 149.1079f, 101.7528f, 152.2125f, 79.4785f, 315.4853f, 123.3120f, 454.7079f, + 316.4901f, 148.2175f, 343.4961f, 188.6391f, 304.9847f, 299.7342f, 419.8321f, 306.6287f, 262.2399f, 320.6758f, 337.1869f, 337.8050f, + 407.5904f, 396.3992f, 545.5580f, 433.1963f, 244.1037f, -8.6806f, 249.9599f, 33.1314f, 144.6461f, 107.1346f, 155.6258f, 113.0233f, + 208.0726f, 334.6470f, 269.1603f, 377.2708f, 173.3525f, 266.8875f, 186.3138f, 296.6358f, 92.1346f, 219.0953f, 132.2813f, 276.5098f, + -50.9776f, -1.5900f, 96.9408f, 56.8000f, 160.0388f, 148.3819f, 192.1737f, 199.8940f, 340.4449f, 407.6198f, 370.9644f, 457.4804f, + -34.0173f, 8.2614f, 52.4551f, 22.6314f, 181.9884f, 195.8403f, 257.1901f, 200.5959f, 278.2621f, 457.0166f, 365.7473f, 488.1317f, + 276.6353f, -31.4300f, 333.7688f, 82.3108f, 326.2304f, 300.5375f, 450.4180f, 449.1682f, 
394.4356f, 59.1311f, 416.0841f, 198.4815f, + 323.4377f, 395.2401f, 388.2682f, 471.3687f, -0.4884f, 332.9131f, 103.2861f, 413.1549f, 172.3276f, 418.9163f, 302.6948f, 466.7889f, + 273.6699f, 49.8039f, 329.7361f, 166.1209f, 79.9860f, 208.1720f, 165.5801f, 323.1208f, 15.6250f, 326.2367f, 26.9268f, 453.0333f, + 98.6064f, 55.6348f, 124.9839f, 190.0650f, 221.7964f, 82.5141f, 233.0980f, 148.2322f, 152.2380f, -44.0412f, 261.6923f, 71.2233f, + 66.3730f, 418.6809f, 110.2940f, 539.8344f, 357.7888f, 331.5282f, 466.6268f, 378.4887f, 457.3967f, 248.0516f, 468.2900f, 387.5087f, + 35.9143f, 364.4689f, 165.4340f, 379.5258f, 402.0395f, 191.2334f, 527.5334f, 340.3795f, 1.8053f, 180.1951f, 16.0557f, 295.9387f, + 460.2114f, 217.3174f, 464.7511f, 232.2148f, 471.2709f, 270.8305f, 480.6579f, 369.6087f, -58.0695f, 97.7211f, 70.1214f, 103.8139f, + 363.5242f, 386.1504f, 399.4951f, 501.9083f, 443.7544f, 345.8341f, 526.4471f, 465.9183f, 420.6959f, 129.4022f, 485.2063f, 220.1614f, + 425.5884f, 224.9686f, 545.1217f, 353.6407f, 238.2388f, 62.7213f, 312.0847f, 78.3060f, 1.2788f, 465.1168f, 76.8773f, 507.2295f, + 350.7072f, 420.0901f, 499.0819f, 482.8026f, 295.2295f, 457.2856f, 318.5988f, 464.6119f, 248.9387f, 366.2193f, 368.7308f, 464.4846f, + 266.4057f, -43.0988f, 411.9049f, 94.8485f, 365.3591f, 230.8355f, 381.3726f, 246.8133f, 213.6699f, 419.1429f, 302.9046f, 467.1919f, + 282.3146f, 326.7091f, 321.6300f, 338.5049f, 157.0835f, 271.7193f, 238.9818f, 413.4953f, -3.7474f, 97.9864f, 45.0004f, 165.3309f, + 28.3577f, 158.4742f, 71.5941f, 260.1006f, 284.2465f, 120.1271f, 370.7495f, 246.4540f, 483.6205f, 186.3921f, 511.9348f, 335.0511f, + -27.5488f, 218.5612f, 43.3521f, 243.6668f, 229.8062f, 103.3855f, 327.7773f, 223.5129f, 365.4548f, 86.1273f, 385.5540f, 219.3533f, + 343.5581f, 121.2852f, 483.2167f, 129.5677f, 234.4260f, 125.8439f, 310.7789f, 239.2034f, 248.4032f, 48.0437f, 371.5128f, 101.8978f, + 299.1465f, 387.2317f, 397.5784f, 484.8726f, 376.0880f, 262.2631f, 482.8782f, 339.8563f, 7.2930f, 47.0424f, 
114.9965f, 86.7440f, + 397.3961f, 336.3557f, 528.7860f, 357.5037f, -33.2049f, 414.6207f, 59.2223f, 433.0458f, 396.8727f, 110.5703f, 439.3271f, 126.9654f, + 30.4567f, 27.2849f, 46.2837f, 123.3157f, 51.6484f, -22.3715f, 142.9798f, 30.9887f, -3.4962f, 6.9860f, 7.3904f, 40.2644f, + 204.1520f, 329.0802f, 241.1047f, 433.1711f, 162.1569f, 441.9229f, 172.2023f, 545.2635f, 41.6043f, -18.2279f, 124.3886f, 63.1082f, + 213.0999f, 303.8811f, 237.9903f, 444.1898f, 155.2101f, 6.7177f, 247.1608f, 65.1444f, 324.4111f, 233.2946f, 443.2500f, 358.8382f, + 384.8351f, 371.9398f, 508.2953f, 384.1355f, 302.7226f, 123.9848f, 349.8446f, 235.2196f, 20.8081f, -68.6720f, 103.6023f, 79.6067f, + 105.2511f, 234.0231f, 190.1397f, 361.1662f, 420.9290f, 451.9373f, 492.3893f, 539.3073f, -4.9387f, 81.6146f, 93.6732f, 176.0028f, + 187.2764f, 67.9256f, 219.5794f, 121.5657f, 397.7987f, 10.8413f, 544.7059f, 113.0846f, 467.5255f, 219.7334f, 483.1394f, 335.5223f, + 143.3246f, 223.3545f, 267.8786f, 373.0906f, 288.9383f, 358.9469f, 378.4586f, 433.3239f, 209.6311f, 371.4695f, 247.1145f, 381.6038f, + 320.6775f, 401.3793f, 432.7831f, 491.1622f, 8.9968f, 393.5190f, 22.5845f, 412.2537f, 13.8844f, 104.8985f, 130.2727f, 142.3685f, + 262.6455f, 252.9446f, 351.5533f, 302.9328f, 107.5252f, 93.7443f, 125.0270f, 203.6677f, 326.6030f, 150.6990f, 339.4493f, 179.0864f, + 119.1742f, 453.1236f, 232.0488f, 478.8208f, 420.9991f, 337.0981f, 465.6465f, 344.7978f, 342.8767f, 421.7388f, 476.3827f, 552.8516f, + 189.1445f, 156.2901f, 303.6933f, 260.6224f, 333.9324f, 265.2428f, 438.9627f, 272.1948f, 114.3128f, 240.9499f, 156.8251f, 246.1655f, + 193.8135f, 11.5223f, 300.4463f, 95.7648f, 27.6040f, 96.8022f, 169.8780f, 139.8998f, 423.1219f, 218.8621f, 437.7643f, 308.7743f, + 386.7347f, 0.8091f, 436.3329f, 66.5652f, 433.0917f, 396.4442f, 469.0579f, 535.0178f, 408.9413f, 39.9801f, 468.5356f, 83.8636f, + 423.9944f, 47.8940f, 535.6019f, 150.0867f, 78.3370f, 378.1336f, 149.9992f, 387.1877f, 422.8927f, -23.2443f, 508.9316f, 120.1789f, + 
261.7021f, 376.5726f, 309.5111f, 523.7055f, 200.2215f, 307.2894f, 222.2736f, 418.4116f, 259.8004f, -0.8479f, 300.5735f, 69.4688f, + 106.7550f, 329.0340f, 235.8474f, 362.8130f, 98.8964f, 254.7818f, 189.6566f, 376.8467f, 91.9970f, 323.3163f, 149.3173f, 434.0331f, + -18.1340f, 397.0634f, 100.5620f, 431.1345f, 242.9804f, 325.0598f, 253.5845f, 393.2908f, 424.4659f, 258.1096f, 463.2957f, 328.0667f, + 297.4333f, 99.1641f, 332.7187f, 223.2992f, 186.5782f, 297.1904f, 334.3975f, 400.0833f, 161.1921f, 430.0698f, 267.4008f, 526.9018f, + 185.6758f, 244.8488f, 278.7259f, 342.6730f, 103.7673f, 311.5224f, 105.5101f, 352.8224f, 397.2368f, 190.3715f, 425.6990f, 246.7565f, + 51.3437f, 374.1586f, 147.0393f, 381.9622f, 329.5223f, 439.7066f, 387.1005f, 557.9608f, 310.6336f, 47.4363f, 449.3514f, 112.9530f, + 229.9626f, 68.0539f, 344.9065f, 134.3514f, 397.6331f, 250.9398f, 465.2933f, 288.4979f, 89.1863f, 224.5854f, 201.8640f, 256.7900f, + 367.6410f, 241.4922f, 513.9763f, 330.0776f, 329.8622f, 6.7118f, 399.5483f, 42.3622f, 351.0067f, 196.8547f, 447.7431f, 207.4218f, + 263.3493f, 233.8098f, 401.2304f, 349.1684f, 404.1452f, 264.0487f, 442.1978f, 321.1426f, 430.0009f, 299.8394f, 563.0980f, 357.4945f, + 202.3143f, 327.4748f, 217.8485f, 392.7412f, 358.1485f, 259.5528f, 455.7672f, 381.9944f, 313.4684f, 370.7192f, 431.1113f, 419.5239f, + 180.1469f, 255.4066f, 272.7232f, 369.3540f, 426.0572f, 198.2577f, 500.8918f, 339.2499f, 150.7206f, 253.3635f, 243.7053f, 352.8329f, + 270.9340f, 17.9364f, 294.5319f, 83.2569f, 36.4112f, 80.3679f, 69.5312f, 192.7886f, 92.2801f, 229.0865f, 133.4951f, 298.3132f, + 375.3135f, 405.1188f, 465.3827f, 467.8684f, 164.8547f, 299.8922f, 231.6980f, 379.1594f, 178.3286f, 21.0337f, 215.7555f, 69.3744f, + 56.7212f, 287.8708f, 189.2598f, 304.4041f, 217.4480f, 79.4625f, 274.1624f, 142.2755f, 369.1791f, 357.2809f, 436.6378f, 376.7356f, + 416.5593f, 382.6425f, 478.6048f, 444.7983f, 21.0025f, 254.7366f, 49.1120f, 338.7197f, 232.4042f, 225.8433f, 342.4166f, 365.5193f, + 199.7265f, 
166.0972f, 267.5468f, 172.4943f, 305.4298f, 176.3264f, 308.8521f, 269.9237f, 151.3188f, 397.4529f, 295.9569f, 466.6555f, + 138.0480f, 359.6507f, 260.5968f, 363.6696f, 181.5352f, 240.7855f, 290.3455f, 278.9682f, 225.7522f, 174.7890f, 356.2469f, 193.4433f, + 182.4345f, 8.5387f, 318.5487f, 41.8410f, 210.4292f, 50.5482f, 261.7152f, 92.4592f, 362.9012f, 66.1153f, 454.9341f, 126.9099f, + 326.9678f, 146.7783f, 418.6802f, 226.6052f, 150.2754f, 471.4981f, 191.1031f, 472.6456f, 383.2531f, 240.0174f, 417.3240f, 265.1360f, + 417.8392f, 109.9494f, 435.8114f, 124.8908f, 27.1272f, 11.4244f, 126.3650f, 94.3257f, 232.6628f, 144.1367f, 350.0197f, 194.1688f, + 85.4650f, 366.5097f, 199.8470f, 449.2209f, 345.5237f, 174.6456f, 393.6487f, 208.6972f, 103.6008f, 383.9478f, 135.1845f, 388.5580f, + 301.4075f, 330.7206f, 369.9960f, 471.9843f, 86.3247f, 46.8414f, 168.7999f, 63.9793f, 186.5999f, 294.3789f, 324.5439f, 314.2809f, + 408.6489f, 468.1303f, 539.9976f, 490.9658f, 121.9074f, 127.4639f, 259.4001f, 274.6741f, 374.0247f, -21.0436f, 501.7138f, 71.9877f, + 421.1110f, 415.6848f, 565.8336f, 507.6180f, 402.2457f, 367.8241f, 472.6052f, 515.8422f, 78.8962f, 253.9820f, 86.9698f, 268.1594f, + 403.1037f, 203.0262f, 416.5545f, 349.2269f, -13.5009f, 90.1716f, 45.6503f, 121.5695f, 176.9532f, 362.8065f, 216.3486f, 456.6442f, + 422.2061f, 217.5038f, 448.5273f, 281.0963f, 272.8624f, -12.1655f, 415.8898f, 46.0433f, 251.3114f, 271.6299f, 281.4290f, 411.3851f, + 121.9583f, 463.6307f, 265.9058f, 486.8656f, 348.9660f, 339.7936f, 463.3310f, 489.3569f, 306.5287f, 109.8543f, 403.0297f, 167.3439f, + 183.3392f, -22.1712f, 285.0661f, 75.4963f, 421.0473f, 397.5667f, 471.4370f, 542.7847f, 66.3152f, 463.7401f, 163.6328f, 473.3226f, + 70.7872f, 196.9543f, 99.6043f, 335.4611f, 251.0428f, 278.3568f, 391.7609f, 363.9607f, 463.0136f, 178.3225f, 508.9808f, 284.2776f, + 104.1169f, 198.2685f, 143.1397f, 221.4969f, 71.3536f, 19.4869f, 178.3168f, 99.9616f, 20.3440f, -2.3003f, 119.1549f, 99.0532f, + 396.1600f, 81.8756f, 
464.4035f, 150.8565f, 65.5815f, 406.2740f, 160.8160f, 430.3668f, 239.2070f, 54.2293f, 263.9715f, 91.6030f, + 444.7733f, 49.1971f, 546.0992f, 177.5016f, -14.5900f, 271.2390f, 26.7309f, 277.3751f, 257.4168f, 54.2554f, 299.0693f, 160.8758f, + 243.5621f, 6.6488f, 268.7269f, 156.5579f, 378.4616f, 280.6006f, 428.9858f, 282.7156f, 152.4626f, 171.5487f, 202.8190f, 196.5445f, + 170.8344f, 262.3559f, 239.5070f, 363.8034f, 69.2827f, 451.1334f, 98.6552f, 461.0720f, 355.5286f, 31.0572f, 385.2867f, 119.9359f, + 351.4949f, 405.2588f, 433.2140f, 508.1748f, 58.2303f, 406.9281f, 78.4330f, 495.5619f, 144.9057f, 386.8375f, 248.5514f, 442.2501f, + 375.6284f, 263.1954f, 517.2766f, 368.0905f, -30.9426f, 265.2984f, 33.6499f, 354.8483f, 81.7472f, 303.6374f, 217.0119f, 335.5753f, + 269.6966f, 302.7942f, 285.3457f, 387.7014f, 163.3466f, -57.9610f, 170.7473f, 74.4432f, 81.7806f, 428.8672f, 190.2646f, 529.2253f, + 172.8226f, 257.1534f, 287.2148f, 328.4503f, 27.4537f, 366.2749f, 154.0694f, 415.1909f, 260.0797f, 181.7424f, 269.5455f, 195.5394f, + 294.9684f, -12.5261f, 411.7275f, 24.9233f, 259.0953f, 253.5339f, 316.1996f, 256.2007f, 23.4560f, 179.5914f, 69.6533f, 327.5987f, + 408.8140f, 201.4197f, 435.5946f, 235.5696f, 12.7857f, 108.6503f, 162.1921f, 231.0668f, 377.1631f, 111.8490f, 387.6489f, 137.9771f, + 118.1705f, 242.1441f, 242.3947f, 285.4007f, 343.2383f, 155.9774f, 439.5230f, 219.3007f, 47.8730f, 460.2977f, 158.3999f, 509.6342f, + 39.8081f, 26.4865f, 146.8540f, 146.4408f, 184.0596f, 87.9846f, 312.9663f, 231.6809f, 2.2755f, 81.2708f, 30.6605f, 212.6897f, + 112.0872f, 259.7130f, 113.2101f, 283.5961f, 316.9157f, 191.2768f, 407.0965f, 308.0034f, 391.8293f, 310.3482f, 445.5542f, 333.3923f, + 30.6705f, 406.4540f, 50.1148f, 543.5478f, 426.6715f, 103.5286f, 455.4062f, 181.6925f, 373.5433f, 320.8254f, 423.9739f, 371.9462f, + 429.1098f, 0.3217f, 440.5745f, 24.7185f, 344.4742f, 129.8145f, 353.9543f, 132.5740f, 268.3326f, 212.8878f, 405.8205f, 250.8319f, + 238.7950f, -53.0971f, 286.2983f, 84.0919f}; + 
std::vector scores_vec = { + 0.9822f, 0.9644f, 0.1426f, 0.7149f, 0.6008f, 0.6906f, 0.0962f, 0.1886f, 0.0766f, 0.6041f, 0.9866f, 0.6720f, + 0.7108f, 0.9846f, 0.6780f, 0.0402f, 0.8670f, 0.3647f, 0.0044f, 0.5072f, 0.9370f, 0.2573f, 0.4915f, 0.1738f, + 0.0577f, 0.0805f, 0.7270f, 0.8641f, 0.1433f, 0.2883f, 0.1950f, 0.0269f, 0.5534f, 0.6999f, 0.6479f, 0.3881f, + 0.5550f, 0.0941f, 0.1543f, 0.9318f, 0.7615f, 0.9227f, 0.9167f, 0.6494f, 0.9282f, 0.4167f, 0.0036f, 0.0626f, + 0.1095f, 0.0954f, 0.3517f, 0.7013f, 0.7906f, 0.5902f, 0.1464f, 0.7479f, 0.3548f, 0.0130f, 0.2806f, 0.3306f, + 0.2742f, 0.8119f, 0.7599f, 0.6956f, 0.1390f, 0.8078f, 0.6772f, 0.1948f, 0.6481f, 0.4835f, 0.4394f, 0.1121f, + 0.5183f, 0.0999f, 0.1643f, 0.1325f, 0.9541f, 0.2849f, 0.3552f, 0.3221f, 0.8983f, 0.5630f, 0.9192f, 0.2999f, + 0.1148f, 0.5562f, 0.3455f, 0.8019f, 0.8794f, 0.4726f, 0.9714f, 0.5530f, 0.2709f, 0.4890f, 0.0373f, 0.8040f, + 0.1014f, 0.3087f, 0.5653f, 0.0430f, 0.0793f, 0.6961f, 0.0718f, 0.4771f, 0.3387f, 0.2281f, 0.1888f, 0.7634f, + 0.9515f, 0.1402f, 0.9597f, 0.5948f, 0.6417f, 0.7099f, 0.7041f, 0.8198f, 0.4835f, 0.5334f, 0.3238f, 0.1053f, + 0.6646f, 0.0336f, 0.2756f, 0.0942f, 0.1907f, 0.6387f, 0.6285f, 0.4211f, 0.0902f, 0.4334f, 0.3527f, 0.7205f, + 0.5790f, 0.4916f, 0.4870f, 0.9663f, 0.7563f, 0.4970f, 0.4792f, 0.0265f, 0.9425f, 0.3192f, 0.2559f, 0.9994f, + 0.7187f, 0.0474f, 0.0619f, 0.0255f, 0.5996f, 0.0716f, 0.9334f, 0.9369f, 0.5461f, 0.6166f, 0.2919f, 0.0640f, + 0.7375f, 0.1018f, 0.0856f, 0.3112f, 0.0125f, 0.4340f, 0.7077f, 0.8013f, 0.6043f, 0.8469f, 0.4065f, 0.8488f, + 0.5065f, 0.2230f, 0.9441f, 0.2750f, 0.0262f, 0.2427f, 0.3667f, 0.3513f, 0.5247f, 0.8831f, 0.2923f, 0.5208f, + 0.3401f, 0.8218f, 0.1576f, 0.1035f, 0.5030f, 0.6719f, 0.7955f, 0.5896f, 0.7738f, 0.3927f, 0.0329f, 0.1161f, + 0.0387f, 0.3289f, 0.4955f, 0.3563f, 0.5606f, 0.4806f, 0.6779f, 0.6670f, 0.3181f, 0.3462f, 0.5851f, 0.5964f, + 0.3147f, 0.3303f, 0.6940f, 0.6474f, 0.1351f, 0.4410f, 0.8927f, 0.0363f, 0.8552f, 0.1632f, 0.5072f, 
0.4243f, + 0.0101f, 0.9154f, 0.4549f, 0.9543f, 0.2867f, 0.8663f, 0.9224f, 0.5568f, 0.2027f, 0.6852f, 0.5490f, 0.9445f, + 0.4393f, 0.2685f, 0.1383f, 0.6986f, 0.9741f, 0.0283f, 0.7404f, 0.9269f, 0.0748f, 0.1102f, 0.6920f, 0.6480f, + 0.0688f, 0.8344f, 0.5234f, 0.9072f, 0.8780f, 0.8125f, 0.5159f, 0.2517f, 0.5060f, 0.1008f, 0.6588f, 0.1340f, + 0.5112f, 0.0544f, 0.2995f, 0.2321f, 0.6200f, 0.7868f, 0.0573f, 0.8503f, 0.8608f, 0.3423f, 0.6590f, 0.4026f, + 0.1542f, 0.5287f, 0.0864f, 0.8785f, 0.9243f, 0.8216f, 0.5625f, 0.5576f, 0.9846f, 0.2479f, 0.0759f, 0.5619f, + 0.3288f, 0.3223f, 0.0071f, 0.5962f, 0.2640f, 0.1879f, 0.0404f, 0.3644f, 0.8790f, 0.3367f, 0.6791f, 0.7565f, + 0.3281f, 0.8216f, 0.6919f, 0.5592f, 0.0010f, 0.0351f, 0.9909f, 0.7823f, 0.9376f, 0.9023f, 0.0204f, 0.7918f, + 0.4511f, 0.7896f, 0.0067f, 0.2882f, 0.7513f, 0.7930f, 0.6197f, 0.3013f, 0.3104f, 0.9668f, 0.4392f, 0.4471f, + 0.5523f, 0.4095f, 0.5527f, 0.4323f, 0.8267f, 0.9091f, 0.9321f, 0.5643f, 0.4421f, 0.7052f, 0.8383f, 0.5630f, + 0.7000f, 0.7497f, 0.6764f, 0.7461f, 0.2086f, 0.4984f, 0.5883f, 0.0025f, 0.8560f, 0.6100f, 0.1291f, 0.8164f, + 0.7171f, 0.7583f, 0.3920f, 0.8542f, 0.4140f, 0.5705f, 0.0006f, 0.6449f, 0.7182f, 0.5671f, 0.4966f, 0.8099f, + 0.6814f, 0.2781f, 0.9591f, 0.7073f, 0.9879f, 0.9713f, 0.9189f, 0.7554f, 0.6094f, 0.1722f, 0.5434f, 0.7654f, + 0.5209f, 0.8682f, 0.1097f, 0.3809f, 0.5060f, 0.4323f, 0.1086f, 0.1535f, 0.8376f, 0.4844f, 0.0487f, 0.0165f, + 0.4735f, 0.1644f, 0.7051f, 0.7953f, 0.2283f, 0.5922f, 0.1544f, 0.3036f, 0.8888f, 0.5441f, 0.8859f, 0.2252f, + 0.3300f, 0.4710f, 0.4801f, 0.9976f, 0.1144f, 0.8520f, 0.8637f, 0.5532f, 0.3440f, 0.5192f, 0.2925f, 0.7991f, + 0.4983f, 0.9258f, 0.6227f, 0.5143f, 0.7111f, 0.5039f, 0.9045f, 0.1844f, 0.9733f, 0.8122f, 0.8607f, 0.4829f, + 0.8372f, 0.3068f, 0.7619f, 0.1405f, 0.3071f, 0.4457f, 0.3223f, 0.3870f, 0.8201f, 0.2567f, 0.7453f, 0.0737f, + 0.7657f, 0.7920f, 0.4017f, 0.7225f, 0.9151f, 0.8007f, 0.3904f, 0.4842f, 0.7794f, 0.2926f, 0.8039f, 0.3281f, + 0.8060f, 
0.0868f, 0.0444f, 0.9977f, 0.8695f, 0.8828f, 0.9513f, 0.4383f, 0.2868f, 0.1300f, 0.5012f, 0.2200f, + 0.9356f, 0.0040f, 0.1432f, 0.2465f, 0.1990f, 0.2258f, 0.6560f, 0.3275f, 0.6150f, 0.8903f, 0.6026f, 0.6945f, + 0.3655f, 0.1597f, 0.3206f, 0.9643f, 0.6218f, 0.2775f, 0.4509f, 0.8355f, 0.6684f, 0.5607f, 0.8852f, 0.6724f, + 0.6427f, 0.1898f, 0.1064f, 0.9651f, 0.5989f, 0.4157f, 0.5890f, 0.0618f, 0.8221f, 0.2166f, 0.8045f, 0.5344f, + 0.2766f, 0.0302f, 0.8158f, 0.1765f, 0.0518f, 0.7559f, 0.3500f, 0.3893f, 0.2471f, 0.8592f, 0.2973f, 0.2102f, + 0.3092f, 0.2031f, 0.3177f, 0.0829f, 0.1585f, 0.4171f, 0.8795f, 0.0573f, 0.2127f, 0.9083f, 0.8900f, 0.6795f, + 0.2405f, 0.4198f, 0.2112f, 0.1286f, 0.3800f, 0.5758f, 0.3599f, 0.6108f, 0.2963f, 0.3459f, 0.7907f, 0.8783f, + 0.3220f, 0.5715f, 0.2782f, 0.0533f, 0.7379f, 0.1710f, 0.4257f, 0.4870f, 0.1845f, 0.0946f, 0.3480f, 0.9523f, + 0.6151f, 0.3814f, 0.0389f, 0.6003f, 0.0923f, 0.5425f, 0.7520f, 0.4236f, 0.2994f, 0.0474f, 0.0248f, 0.4300f, + 0.8833f, 0.2441f, 0.5741f, 0.6843f, 0.0608f, 0.1531f, 0.3313f, 0.6701f, 0.4390f, 0.7342f, 0.8676f, 0.7584f, + 0.9922f, 0.7544f, 0.8522f, 0.8324f, 0.7303f, 0.8018f, 0.9347f, 0.4752f, 0.6383f, 0.5149f, 0.8510f, 0.4314f, + 0.8197f, 0.7994f, 0.9619f, 0.2489f, 0.7096f, 0.7569f, 0.9363f, 0.9069f, 0.5735f, 0.5792f, 0.1673f, 0.9750f, + 0.2550f, 0.7247f, 0.7958f, 0.4412f, 0.2112f, 0.1890f, 0.8565f, 0.5108f, 0.0901f, 0.7170f, 0.2502f, 0.8764f, + 0.3096f, 0.2003f, 0.0849f, 0.5115f, 0.4507f, 0.7513f, 0.4646f, 0.3438f, 0.2617f, 0.2781f, 0.9278f, 0.1651f, + 0.9882f, 0.3269f, 0.0884f, 0.2487f, 0.0584f, 0.7900f, 0.5126f, 0.3370f, 0.6620f, 0.6306f, 0.9399f, 0.9613f, + 0.6807f, 0.8178f, 0.7924f, 0.4913f, 0.7045f, 0.0783f, 0.7580f, 0.9618f, 0.0850f, 0.8361f, 0.9330f, 0.2262f, + 0.5248f, 0.9279f, 0.9602f, 0.1279f, 0.3490f, 0.6981f, 0.2216f, 0.3248f, 0.0233f, 0.1535f, 0.5623f, 0.6531f, + 0.6489f, 0.7784f, 0.4153f, 0.2735f, 0.0156f, 0.2066f, 0.3124f, 0.1782f, 0.0201f, 0.1574f, 0.6661f, 0.6296f, + 0.9357f, 0.7982f, 0.5678f, 
0.1376f, 0.5641f, 0.0616f, 0.4309f, 0.3903f, 0.4278f, 0.2798f, 0.6858f, 0.8409f, + 0.7685f, 0.6278f, 0.5383f, 0.0311f, 0.7229f, 0.5450f, 0.2707f, 0.3278f, 0.9356f, 0.6244f, 0.4759f, 0.6209f, + 0.4137f, 0.4702f, 0.2903f, 0.4399f, 0.6856f, 0.0399f, 0.7950f, 0.2830f, 0.6826f, 0.6427f, 0.6526f, 0.6081f, + 0.9591f, 0.5083f, 0.7323f, 0.7054f, 0.2363f, 0.2833f, 0.4240f, 0.2777f, 0.3667f, 0.3910f, 0.6039f, 0.2199f, + 0.8043f, 0.4375f, 0.7062f, 0.0814f, 0.4700f, 0.0282f, 0.6759f, 0.3437f, 0.9493f, 0.3241f, 0.5638f, 0.2574f, + 0.6201f, 0.4670f, 0.3706f, 0.2037f, 0.1115f, 0.1199f, 0.9990f, 0.4123f, 0.0019f, 0.9529f, 0.0200f, 0.4186f, + 0.7175f, 0.9146f, 0.7129f, 0.4636f, 0.9744f, 0.0393f, 0.9869f, 0.8494f, 0.9289f, 0.2548f, 0.1425f, 0.6633f, + 0.5159f, 0.5232f, 0.9246f, 0.6201f, 0.3111f, 0.4001f, 0.1335f, 0.1923f, 0.1434f, 0.8103f, 0.7049f, 0.5303f, + 0.3744f, 0.6685f, 0.8129f, 0.8812f, 0.5470f, 0.8199f, 0.5113f, 0.4745f, 0.8654f, 0.3864f, 0.3959f, 0.3049f, + 0.5187f, 0.5449f, 0.6605f, 0.4305f, 0.2178f, 0.8668f, 0.3460f, 0.9229f, 0.2074f, 0.5601f, 0.5366f, 0.8286f, + 0.1389f, 0.9099f, 0.5314f, 0.5861f, 0.5102f, 0.0360f, 0.4971f, 0.2635f, 0.3427f, 0.6491f, 0.4977f, 0.0932f, + 0.0730f, 0.1857f, 0.1909f, 0.6083f, 0.1778f, 0.8817f, 0.2098f, 0.0911f, 0.8757f, 0.2953f, 0.4254f, 0.9590f, + 0.9444f, 0.7149f, 0.0689f, 0.5933f, 0.9891f, 0.9469f, 0.1060f, 0.3960f}; + + migraphx::parameter_map host_params; + host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 143, 0, 0, 10, 0, 0, 13, 0, 0, 0, 0, 0, 90, 0, 0, 135, 0, 0, 1, 0, 0, 76, 0, 0, 108, 0, 0, 170, 0, 0, 140, 0, 0, 20, 0, 0, 151, 0, 0, 150, 0, 0, 39, 0, 0, 44, 0, 0, 41, 0, 0, 82, 0, 0, 80, 0, 0, 88, 0, 0, 16, 0, 0, 27, 0, 0, 167, 0, 0, 165, 0, 0, 181, 0, 1, 187, 0, 1, 94, 0, 1, 152, 0, 1, 72, 
0, 1, 32, 0, 1, 153, 0, 1, 109, 0, 1, 150, 0, 1, 19, 0, 1, 27, 0, 1, 96, 0, 1, 35, 0, 1, 197, 0, 1, 68, 0, 1, 22, 0, 1, 154, 0, 1, 17, 0, 1, 117, 0, 1, 43, 0, 1, 97, 0, 1, 10, 0, 1, 180, 0, 1, 182, 0, 1, 67, 0, 1, 44, 1, 0, 35, 1, 0, 152, 1, 0, 175, 1, 0, 4, 1, 0, 71, 1, 0, 166, 1, 0, 127, 1, 0, 38, 1, 0, 170, 1, 0, 44, 1, 0, 158, 1, 0, 198, 1, 0, 24, 1, 0, 101, 1, 0, 171, 1, 0, 2, 1, 0, 53, 1, 0, 102, 1, 0, 66, 1, 0, 140, 1, 0, 37, 1, 0, 98, 1, 0, 115, 1, 0, 150, 1, 0, 6, 1, 1, 114, 1, 1, 196, 1, 1, 0, 1, 1, 126, 1, 1, 124, 1, 1, 19, 1, 1, 11, 1, 1, 26, 1, 1, 84, 1, 1, 191, 1, 1, 117, 1, 1, 104, 1, 1, 197, 1, 1, 192, 1, 1, 10, 1, 1, 48, 1, 1, 68, 1, 1, 22, 1, 1, 128, 1, 1, 25, 1, 1, 134, 1, 1, 163, 1, 1, 121, 1, 1, 169, 1, 1, 185}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 100); +} + + int main(int argc, const char* argv[]) { test::run(argc, argv); } From 600d9fbb6cbdb280dd78ed884391e17e58170a8f Mon Sep 17 00:00:00 2001 From: charlie Date: Tue, 19 May 2026 21:22:33 -0500 Subject: [PATCH 15/32] Progress update --- src/targets/gpu/jit/nonmaxsuppression.cpp | 39 ++++++++++++++----- src/targets/gpu/prepare_nonmaxsuppression.cpp | 13 +++---- test/gpu/nonmaxsuppression.cpp | 8 +++- 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index dfa5aaffcba..dbfe766882b 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -46,12 +46,21 @@ extern "C" { MIGRAPHX_GLOBAL void nms_sort_kernel(${params}) { - make_tensors()(${args})([](auto boxes, auto scores, auto sorted) { + make_tensors()(${args})([](auto boxes, + auto scores, + auto sorted_scores, + auto sorted_boxes, + auto sorted_box_indices) { nonmaxsuppression_sort<${center_point_box}, ${num_batches}, ${num_classes}, ${num_boxes}, - ${aligned_num_boxes}>(boxes, scores, sorted); + ${aligned_num_boxes}>( + boxes, + scores, + 
sorted_scores, + sorted_boxes, + sorted_box_indices); }); } @@ -71,18 +80,28 @@ extern "C" { MIGRAPHX_GLOBAL void nms_filter_kernel(${params}) { - make_tensors()(${args})([](auto sorted, - auto max_p, - auto iou_p, - auto thr_p, - auto mask, - auto output, - auto counts) { + make_tensors()(${args})([](auto sorted_scores, + auto sorted_boxes, + auto sorted_box_indices, + auto max_p, + auto iou_p, + auto thr_p, + auto mask, + auto output, + auto counts) { nonmaxsuppression_filter<${num_batches}, ${num_classes}, ${num_boxes}, ${aligned_num_boxes}>( - sorted, max_p, iou_p, thr_p, mask, output, counts); + sorted_scores, + sorted_boxes, + sorted_box_indices, + max_p, + iou_p, + thr_p, + mask, + output, + counts); }); } diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/prepare_nonmaxsuppression.cpp index 53514963c13..e6e2c021578 100644 --- a/src/targets/gpu/prepare_nonmaxsuppression.cpp +++ b/src/targets/gpu/prepare_nonmaxsuppression.cpp @@ -38,11 +38,6 @@ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { -// nms_data is laid out as { float score; float box[4]; int box_index; } for a -// total of 24 bytes per entry. The scratch workspace is allocated as raw uint8 -// and reinterpreted in the kernel. -static constexpr std::size_t nms_bytes_per_data = 24; - // Sort boxes per (batch, class) into nms_data{} tensor. 
struct nms_sort { @@ -66,9 +61,11 @@ struct nms_sort const auto num_batches = boxes_s.lens()[0]; const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; - const auto aligned_b = - static_cast(bit_ceil(static_cast(num_boxes))); - return shape{shape::uint8_type, {num_batches * num_classes * aligned_b * nms_bytes_per_data}}; + const auto aligned_b = static_cast(bit_ceil(static_cast(num_boxes))); + shape out_scores_shape{shape::float_type, {num_batches * num_classes, aligned_b}}; + shape out_boxes_shape{shape::float_type, {num_batches * num_classes, aligned_b, 4}}; + shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}}; + return shape{{out_scores_shape, out_boxes_shape, out_box_index_shape}}; } }; MIGRAPHX_REGISTER_OP(nms_sort); diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp index 3f7aab9b432..119b2ab6a8e 100644 --- a/test/gpu/nonmaxsuppression.cpp +++ b/test/gpu/nonmaxsuppression.cpp @@ -66,7 +66,7 @@ static void add_nms_return(migraphx::module* mm, migraphx::instruction_ref nms) mm->add_return({idx, cnt}); } -TEST_CASE(nms_test) +TEST_CASE(nms_default_test) { migraphx::program p; auto* mm = p.get_main_module(); @@ -362,6 +362,7 @@ TEST_CASE(nms_multi_class_test) EXPECT(num_selected == 4); } +// Values generated from onnxruntime CPU EP TEST_CASE(nms_20boxes_test) { migraphx::program p; @@ -407,6 +408,7 @@ TEST_CASE(nms_20boxes_test) EXPECT(num_selected == 10); } +// Values generated from onnxruntime CPU EP TEST_CASE(nms_50boxes_center_test) { migraphx::program p; @@ -465,6 +467,7 @@ TEST_CASE(nms_50boxes_center_test) EXPECT(num_selected == 20); } +// Values generated from onnxruntime CPU EP TEST_CASE(nms_100boxes_2batch_test) { migraphx::program p; @@ -585,6 +588,7 @@ TEST_CASE(nms_100boxes_2batch_test) EXPECT(num_selected == 30); } +// Values generated from onnxruntime CPU EP TEST_CASE(nms_30boxes_3class_test) { migraphx::program p; @@ -639,6 +643,7 @@ 
TEST_CASE(nms_30boxes_3class_test) EXPECT(num_selected == 15); } +// Values generated from onnxruntime CPU EP TEST_CASE(nms_200boxes_2batch_2class_test) { migraphx::program p; @@ -876,5 +881,4 @@ TEST_CASE(nms_200boxes_2batch_2class_test) EXPECT(num_selected == 100); } - int main(int argc, const char* argv[]) { test::run(argc, argv); } From d5934c067ee5183e99f82215505e80bb50388fa0 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 20 May 2026 15:40:32 -0500 Subject: [PATCH 16/32] Version with iterator nms_data --- src/include/migraphx/op/nonmaxsuppression.hpp | 1 + src/targets/gpu/jit/nonmaxsuppression.cpp | 13 +- .../migraphx/kernels/nonmaxsuppression.hpp | 151 ++++++++++++------ .../kernels/include/migraphx/kernels/sort.hpp | 9 +- src/targets/gpu/lowering.cpp | 21 +++ 5 files changed, 139 insertions(+), 56 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index b6cbd4c9bc1..d154733a581 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -75,6 +75,7 @@ struct nonmaxsuppression // Per ONNX spec, output is [num_selected_indices, 3] where each row is // [batch_index, class_index, box_index]. The maximum possible // num_selected_indices = num_batches * num_classes * spatial_dimension. 
+ // TODO: can also be limited by max_output_boxes_per_class const auto max_num_boxes = max_batches * max_classes * max_spatial_dimension; auto fixed_shape_error_check = [&]() { diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index dbfe766882b..ff6b1d32ece 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -126,7 +126,10 @@ MIGRAPHX_GLOBAL void nms_compact_kernel(${params}) auto selected_indices, auto num_selected) { nonmaxsuppression_compact<${num_batch_class}, ${num_boxes}>( - bc_counts, indices, selected_indices, num_selected); + bc_counts, + indices, + selected_indices, + num_selected); }); } @@ -153,16 +156,16 @@ struct nms_sort_compiler : compiler auto block_size = compute_block_size(ctx, aligned_num_boxes, 1024); hip_compile_options options; - options.inputs = inputs; + options.inputs = flatten_shapes(inputs); options.output = inputs.back(); options.kernel_name = "nms_sort_kernel"; - options.virtual_inputs = inputs; + options.virtual_inputs = options.inputs; options.set_launch_params(v, num_batches * num_classes * block_size, block_size); auto src = interpolate_string( nms_sort_kernel_src, - {{"params", enum_params(inputs.size(), "void * private_p")}, - {"args", enum_params(inputs.size(), "private_p")}, + {{"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, {"num_batches", std::to_string(num_batches)}, {"num_classes", std::to_string(num_classes)}, {"num_boxes", std::to_string(num_boxes)}, diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index bde081bbc69..f999855c8e9 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -38,17 +38,19 @@ namespace migraphx { 
+template struct nms_data { - float score; - array box; - int box_index; + // should hold iterators + Score score; + Box box; + Index box_index; }; // Decode a single box into (xmin, ymin, xmax, ymax) corners. // Normalize such that [x1, y1] is the bottom left corner. template -__device__ inline array nms_normalize_box(Box box) +__device__ inline array nms_normalize_box(Box box) { if constexpr(CenterPointBox) { @@ -99,11 +101,12 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N } // Comparator for sorting nms_data{}. +template struct nms_score_greater { - constexpr bool operator()(const nms_data& a, const nms_data& b) const + constexpr bool operator()(const nms_data& a, const nms_data& b) const { - return a.score > b.score; + return *(a.score) > *(b.score); } }; @@ -121,8 +124,15 @@ template -__device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output out_tv) + class SortedScores, + class SortedBoxes, + class SortedIndices> +__device__ void nonmaxsuppression_sort( + Boxes boxes_tv, + Scores scores_tv, + SortedScores sorted_scores, + SortedBoxes sorted_boxes, + SortedIndices sorted_indices) { static_assert(NumBatches > 0); static_assert(NumClasses > 0); @@ -134,42 +144,60 @@ __device__ void nonmaxsuppression_sort(Boxes boxes_tv, Scores scores_tv, Output const int batch_idx = block_id / NumClasses; const int class_idx = block_id % NumClasses; - constexpr auto block_out_shape = make_shape(index_ints{}); - auto* p = reinterpret_cast(out_tv.data()) + block_id * AlignedNumBoxes; - auto block_out_tv = make_tensor_view(p, block_out_shape); - // numpy indexing: scores[batch_idx, class_idx, :] const auto my_scores = slice_tensor(scores_tv, array{batch_idx, class_idx, 0}, slice_axes<2>()); + + auto block_out_scores = slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); + auto block_out_boxes = slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); + auto block_out_indices = 
slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); + + using scores_type = decltype(block_out_scores.begin()); + using boxes_type = decltype(block_out_boxes.begin()); + using indices_type = decltype(block_out_indices.begin()); + __shared__ uninitialized_buffer, AlignedNumBoxes> block_nms_data; - nms_data tmp_data; idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - tmp_data.score = my_scores[i]; + block_out_scores[i] = my_scores[i]; // numpy indexing: boxes[batch_idx, i, :] - tmp_data.box = nms_normalize_box(slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>())); - tmp_data.box_index = static_cast(i); + auto normed_box = nms_normalize_box( + slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>()) + ); + // numpy syntax: out_boxes[block_id, i, 0] + auto out_boxes_iter = block_out_boxes.begin_at(array{0, i, 0}); + copy(normed_box.begin(), normed_box.end(), out_boxes_iter); + block_out_indices[i] = i; } else { // Sentinel score so it never beats any real entry - tmp_data.score = numeric_lowest(); - tmp_data.box = array{0.f, 0.f, 0.f, 0.f}; - tmp_data.box_index = -1; + block_out_scores[i] = numeric_lowest(); + auto filler_box = array{0.f, 0.f, 0.f, 0.f}; + auto out_boxes_iter = block_out_boxes.begin_at(array{0, i, 0}); + copy(filler_box.begin(), filler_box.end(), out_boxes_iter); + block_out_indices[i] = -1; } - block_out_tv[i] = tmp_data; + block_nms_data[i] = { + block_out_scores.begin_at(array{0, i}), + block_out_boxes.begin_at(array{0, i, 0}), + block_out_indices.begin_at(array{0, i}) + }; }); __syncthreads(); - bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_out_tv); + bitonic_sort> + { + nms_score_greater{} + }.template block_sort(idx, block_nms_data); } -// Build the packed upper-triangular IoU mask for the NumBoxes sorted boxes. +// Build the packed upper-triangular IoU mask for the NumBoxes nms_data boxes. 
// Work is striped such that each thread does a multiple of 2 rows so each does roughly the same // amount of work regardless of where it falls in the triangle. -// `sorted`: sorted nms_data{} tensor +// `nms_data`: nms_data nms_data{} tensor // `mask`: bool mask tensor -template -__device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, IouThreshold iou_threshold) +template +__device__ void nms_make_iou_mask(index idx, const NMSData nms_data, Mask mask, IouThreshold iou_threshold) { static_assert(NumBoxes > 0); constexpr index_int half = NumBoxes / 2; @@ -178,7 +206,7 @@ __device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, for(index_int j = i + 1; j < NumBoxes; ++j) { mask[nms_packed_idx(i, j, NumBoxes)] = - nms_iou_over_threshold(sorted[i].box, sorted[j].box, iou_threshold) ? 1 : 0; + nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold) ? 1 : 0; } }; @@ -197,9 +225,9 @@ __device__ void nms_make_iou_mask(index idx, const SortedData sorted, Mask mask, // TODO: use template for types // Greedy filter that writes selections into a per-batch per-class region of output. -template +template __device__ void nms_filter_per_block(index idx, - const Sorted sorted, + const NMSData nms_data, const Mask mask, int64_t max_output, float score_thr, @@ -215,7 +243,7 @@ __device__ void nms_filter_per_block(index idx, // Match the ref op: only filter by score when score_threshold > 0. 
const bool do_filter = score_thr > 0.f; idx.local_stride(NumBoxes, [&](auto i) { - removed[i] = (do_filter and sorted[i].score < score_thr); + removed[i] = (do_filter and *(nms_data[i].score) < score_thr); }); __syncthreads(); @@ -231,9 +259,9 @@ __device__ void nms_filter_per_block(index idx, { if(idx.local == 0) { - output[output_idx * 3 + 0] = batch_idx; - output[output_idx * 3 + 1] = class_idx; - output[output_idx * 3 + 2] = sorted[i].box_index; + array tmp = {batch_idx, class_idx, *(nms_data[i].box_index)}; + auto output_iter = output.begin_at(array{block_id, output_idx, 0}); + copy(tmp.begin(), tmp.end(), output_iter); } ++output_idx; for(index_int j = i + 1 + idx.local; j < NumBoxes; j += idx.nlocal()) @@ -254,14 +282,18 @@ template -__device__ void nonmaxsuppression_filter(Sorted sorted_buf, +__device__ void nonmaxsuppression_filter(SortedScores sorted_scores, + SortedBoxes sorted_boxes, + SortedIndices sorted_indices, MaxOut max_out_p, IouThr iou_thr_p, ScoreThr score_thr_p, @@ -274,31 +306,48 @@ __device__ void nonmaxsuppression_filter(Sorted sorted_buf, static_assert(NumBoxes > 0); auto idx = make_index(); - const index_int block_id = idx.group; - //constexpr index_int iou_packed_size = (NumBoxes > 1) ? 
(NumBoxes * (NumBoxes - 1)) / 2 : 1; + const index_int block_idx = idx.group; - constexpr auto my_sorted_shape = make_shape(index_ints{}); - nms_data* my_sorted_p = reinterpret_cast(sorted_buf.data()) + block_id * AlignedNumBoxes; - auto my_sorted = make_tensor_view(my_sorted_p, my_sorted_shape); + auto my_sorted_scores = slice_tensor(sorted_scores, array{block_idx, 0}, slice_axes<1>()); + auto my_sorted_boxes = slice_tensor(sorted_boxes, array{block_idx, 0, 0}, slice_axes<1, 2>()); + auto my_sorted_indices = slice_tensor(sorted_indices, array{block_idx, 0}, slice_axes<1>()); + + using scores_type = decltype(my_sorted_scores.begin()); + using boxes_type = decltype(my_sorted_boxes.begin()); + using indices_type = decltype(my_sorted_indices.begin()); + __shared__ uninitialized_buffer, NumBoxes> block_nms_data; + + idx.local_stride(AlignedNumBoxes, [&](auto i) { + if(i < NumBoxes) + { + block_nms_data[i] = { + my_sorted_scores.begin_at(array{0, i}), + my_sorted_boxes.begin_at(array{0, i, 0}), + my_sorted_indices.begin_at(array{0, i}) + }; + } + }); + __syncthreads(); - auto my_mask = slice_tensor(mask, block_id, slice_axes<1>()); - auto my_output = slice_tensor(output, block_id, slice_axes<1, 2>()); + auto my_mask = slice_tensor(mask, array{block_idx, 0}, slice_axes<1>()); + auto my_output = slice_tensor(output, array{block_idx, 0, 0}, slice_axes<1, 2>()); // Read scalar tensor inputs const int64_t max_output_boxes_per_class = max_out_p[0]; const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; - nms_make_iou_mask(idx, my_sorted, my_mask, iou_thr_val); + nms_make_iou_mask(idx, block_nms_data, my_mask, iou_thr_val); __syncthreads(); - nms_filter_per_block(idx, - my_sorted, - my_mask, - max_output_boxes_per_class, - score_thr_val, - my_output, - bc_counts); + nms_filter_per_block( + idx, + block_nms_data, + my_mask, + max_output_boxes_per_class, + score_thr_val, + my_output, + bc_counts); } @@ -325,6 +374,7 @@ __device__ void 
nonmaxsuppression_compact(const Counts bc_counts, auto idx = make_index(); __shared__ index_int offsets[NumBatchClass]; // Exclusive prefix sum on bc_counts to get offsets + // TODO: there's probably a better way to get the exclusive prefix sum rather than doing the minus each time. block_scan( idx, op::sum{}, @@ -340,7 +390,8 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts, num_selected[0] = offsets[NumBatchClass-1] + bc_counts[NumBatchClass-1]; } - // swap index values to make the output packed + // rearrange index values to make the output packed. + // TODO: this could be done in-place to save memory. constexpr index_int index_size = 3; constexpr index_int max_entries = NumBatchClass * NumBoxes; idx.local_stride(max_entries, [&](auto i) { diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index 980a628682b..fb59d3724c6 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -160,7 +160,14 @@ struct bitonic_sort { const bool reverse = (tid & k) != 0; if(this->compare(buf[tid], buf[partner], reverse)) - swap(buf[tid], buf[partner]); + { + swap(*(buf[tid].score), *(buf[partner].score)); + swap(*(buf[tid].box), *(buf[partner].box)); + swap(*(buf[tid].box+1), *(buf[partner].box+1)); + swap(*(buf[tid].box+2), *(buf[partner].box+2)); + swap(*(buf[tid].box+3), *(buf[partner].box+3)); + swap(*(buf[tid].box_index), *(buf[partner].box_index)); + } } }); __syncthreads(); diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 1a9275de52b..196a0353a35 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -109,6 +109,7 @@ struct miopen_apply add_loop_op(); add_neg_op(); add_lrn_op(); + add_nms_op(); add_convolution_backwards_op(); add_select_module_op(); add_reshape_lazy_op(); @@ -446,6 +447,26 @@ struct miopen_apply }); } + void add_nms_op() + { + 
apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { + auto s = ins->get_shape(); + auto output = insert_allocation(ins, s); + std::vector cpu_inputs; + auto inputs = ins->inputs(); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); + auto gpu_out = + mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); + return mod->replace_instruction(ins, gpu_out); + }); + } + void add_lrn_op() { apply_map.emplace("lrn", [=](instruction_ref ins) { From b5c1e7760e14c29444beadd4ca7ca54779b46a56 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 20 May 2026 16:40:06 -0500 Subject: [PATCH 17/32] Kernel version using block shared memory for nms_data --- .../migraphx/kernels/nonmaxsuppression.hpp | 149 +++++++++--------- .../kernels/include/migraphx/kernels/sort.hpp | 13 +- 2 files changed, 76 insertions(+), 86 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index f999855c8e9..5ac4c46f4cb 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -41,16 +41,26 @@ namespace migraphx { template struct nms_data { - // should hold iterators + // holds a copy of data Score score; - Box box; + array box; Index box_index; }; +// Comparator for sorting nms_data{}. +template +struct nms_score_greater +{ + constexpr bool operator()(const nms_data& a, const nms_data& b) const + { + return a.score > b.score; + } +}; + // Decode a single box into (xmin, ymin, xmax, ymax) corners. // Normalize such that [x1, y1] is the bottom left corner. 
template -__device__ inline array nms_normalize_box(Box box) +__device__ inline array nms_normalize_box(const Box box) { if constexpr(CenterPointBox) { @@ -77,7 +87,7 @@ __device__ inline array nms_normalize_box(Box box) template __device__ inline bool -nms_iou_over_threshold(const Box a, Box b, Threshold threshold) +nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold) { const float left = max(a[0], b[0]); const float right = min(a[2], b[2]); @@ -100,16 +110,6 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N return (i * N - (i * (i + 1)) / 2) + j - (i + 1); } -// Comparator for sorting nms_data{}. -template -struct nms_score_greater -{ - constexpr bool operator()(const nms_data& a, const nms_data& b) const - { - return *(a.score) > *(b.score); - } -}; - // One block per (batch_idx, class_idx). // Load data into per-block buffer of nms_data. // Pads values after N with sentinel values. @@ -128,8 +128,8 @@ template __device__ void nonmaxsuppression_sort( - Boxes boxes_tv, - Scores scores_tv, + const Boxes boxes_tv, + const Scores scores_tv, SortedScores sorted_scores, SortedBoxes sorted_boxes, SortedIndices sorted_indices) @@ -146,49 +146,44 @@ __device__ void nonmaxsuppression_sort( // numpy indexing: scores[batch_idx, class_idx, :] const auto my_scores = slice_tensor(scores_tv, array{batch_idx, class_idx, 0}, slice_axes<2>()); - - auto block_out_scores = slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); - auto block_out_boxes = slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); - auto block_out_indices = slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); - - using scores_type = decltype(block_out_scores.begin()); - using boxes_type = decltype(block_out_boxes.begin()); - using indices_type = decltype(block_out_indices.begin()); + + using scores_type = typename SortedScores::type; + using boxes_type = typename SortedBoxes::type; + using indices_type = 
typename SortedIndices::type; + // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS. + // TODO: can add a static_assert on needed LDS size __shared__ uninitialized_buffer, AlignedNumBoxes> block_nms_data; - idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - block_out_scores[i] = my_scores[i]; - // numpy indexing: boxes[batch_idx, i, :] - auto normed_box = nms_normalize_box( - slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>()) - ); - // numpy syntax: out_boxes[block_id, i, 0] - auto out_boxes_iter = block_out_boxes.begin_at(array{0, i, 0}); - copy(normed_box.begin(), normed_box.end(), out_boxes_iter); - block_out_indices[i] = i; + block_nms_data[i].score = my_scores[i]; + block_nms_data[i].box = nms_normalize_box( + slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>())); + block_nms_data[i].box_index = static_cast(i); } else { - // Sentinel score so it never beats any real entry - block_out_scores[i] = numeric_lowest(); - auto filler_box = array{0.f, 0.f, 0.f, 0.f}; - auto out_boxes_iter = block_out_boxes.begin_at(array{0, i, 0}); - copy(filler_box.begin(), filler_box.end(), out_boxes_iter); - block_out_indices[i] = -1; + block_nms_data[i].score = numeric_lowest(); + block_nms_data[i].box = array{0.f, 0.f, 0.f, 0.f}; + block_nms_data[i].box_index = -1; } - block_nms_data[i] = { - block_out_scores.begin_at(array{0, i}), - block_out_boxes.begin_at(array{0, i, 0}), - block_out_indices.begin_at(array{0, i}) - }; }); __syncthreads(); - bitonic_sort> - { - nms_score_greater{} - }.template block_sort(idx, block_nms_data); + + bitonic_sort>{nms_score_greater{}} + .template block_sort(idx, block_nms_data); + __syncthreads(); + + // Copy sorted result back to global memory. 
+ auto block_out_scores = slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); + auto block_out_boxes = slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); + auto block_out_indices = slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); + idx.local_stride(AlignedNumBoxes, [&](auto i) { + block_out_scores[i] = block_nms_data[i].score; + auto out_box_iter = block_out_boxes.begin_at(array{0, i, 0}); + copy(block_nms_data[i].box.begin(), block_nms_data[i].box.end(), out_box_iter); + block_out_indices[i] = block_nms_data[i].box_index; + }); } // Build the packed upper-triangular IoU mask for the NumBoxes nms_data boxes. @@ -197,7 +192,7 @@ __device__ void nonmaxsuppression_sort( // `nms_data`: nms_data nms_data{} tensor // `mask`: bool mask tensor template -__device__ void nms_make_iou_mask(index idx, const NMSData nms_data, Mask mask, IouThreshold iou_threshold) +__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const IouThreshold iou_threshold) { static_assert(NumBoxes > 0); constexpr index_int half = NumBoxes / 2; @@ -226,12 +221,12 @@ __device__ void nms_make_iou_mask(index idx, const NMSData nms_data, Mask mask, // TODO: use template for types // Greedy filter that writes selections into a per-batch per-class region of output. template -__device__ void nms_filter_per_block(index idx, +__device__ void nms_filter_per_block(const index idx, const NMSData nms_data, const Mask mask, - int64_t max_output, - float score_thr, - Output output, + const int64_t max_output, + const float score_thr, + Output block_output, Counts bc_counts) { static_assert(NumBoxes > 0); @@ -243,7 +238,7 @@ __device__ void nms_filter_per_block(index idx, // Match the ref op: only filter by score when score_threshold > 0. 
const bool do_filter = score_thr > 0.f; idx.local_stride(NumBoxes, [&](auto i) { - removed[i] = (do_filter and *(nms_data[i].score) < score_thr); + removed[i] = (do_filter and nms_data[i].score < score_thr); }); __syncthreads(); @@ -259,8 +254,8 @@ __device__ void nms_filter_per_block(index idx, { if(idx.local == 0) { - array tmp = {batch_idx, class_idx, *(nms_data[i].box_index)}; - auto output_iter = output.begin_at(array{block_id, output_idx, 0}); + array tmp = {batch_idx, class_idx, nms_data[i].box_index}; + auto output_iter = block_output.begin_at(array{0, output_idx, 0}); copy(tmp.begin(), tmp.end(), output_iter); } ++output_idx; @@ -278,6 +273,8 @@ __device__ void nms_filter_per_block(index idx, // Per-block filter driver: one block per (batch_idx, class_idx).`. // Expecting box-coordinate convention has already been normalized into corner form. +// TODO: Merge the nonmaxsuppression_sort and nonmaxsuppression_filter kernels by relaxing +// the AlignedNumBoxes restriction for the sort. 
template -__device__ void nonmaxsuppression_filter(SortedScores sorted_scores, - SortedBoxes sorted_boxes, - SortedIndices sorted_indices, - MaxOut max_out_p, - IouThr iou_thr_p, - ScoreThr score_thr_p, +__device__ void nonmaxsuppression_filter(const SortedScores sorted_scores, + const SortedBoxes sorted_boxes, + const SortedIndices sorted_indices, + const MaxOut max_out_p, + const IouThr iou_thr_p, + const ScoreThr score_thr_p, Mask mask, Output output, Counts bc_counts) @@ -305,30 +302,29 @@ __device__ void nonmaxsuppression_filter(SortedScores sorted_scores, static_assert(NumClasses > 0); static_assert(NumBoxes > 0); - auto idx = make_index(); - const index_int block_idx = idx.group; + auto idx = make_index(); + const index_int block_idx = idx.group; auto my_sorted_scores = slice_tensor(sorted_scores, array{block_idx, 0}, slice_axes<1>()); auto my_sorted_boxes = slice_tensor(sorted_boxes, array{block_idx, 0, 0}, slice_axes<1, 2>()); auto my_sorted_indices = slice_tensor(sorted_indices, array{block_idx, 0}, slice_axes<1>()); - using scores_type = decltype(my_sorted_scores.begin()); - using boxes_type = decltype(my_sorted_boxes.begin()); - using indices_type = decltype(my_sorted_indices.begin()); + using scores_type = typename SortedScores::type; + using boxes_type = typename SortedBoxes::type; + using indices_type = typename SortedIndices::type; + // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS. 
+ // TODO: can add a static_assert on needed LDS size __shared__ uninitialized_buffer, NumBoxes> block_nms_data; idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - block_nms_data[i] = { - my_sorted_scores.begin_at(array{0, i}), - my_sorted_boxes.begin_at(array{0, i, 0}), - my_sorted_indices.begin_at(array{0, i}) - }; + block_nms_data[i].score = my_sorted_scores[i]; + auto boxes_iter = my_sorted_boxes.begin_at(array{0, i, 0}); + copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin()); + block_nms_data[i].box_index = my_sorted_indices[i]; } }); - __syncthreads(); - auto my_mask = slice_tensor(mask, array{block_idx, 0}, slice_axes<1>()); auto my_output = slice_tensor(output, array{block_idx, 0, 0}, slice_axes<1, 2>()); @@ -337,9 +333,10 @@ __device__ void nonmaxsuppression_filter(SortedScores sorted_scores, const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; - nms_make_iou_mask(idx, block_nms_data, my_mask, iou_thr_val); __syncthreads(); + nms_make_iou_mask(idx, block_nms_data, my_mask, iou_thr_val); + __syncthreads(); nms_filter_per_block( idx, block_nms_data, diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index fb59d3724c6..b49d78ca572 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -146,8 +146,8 @@ struct bitonic_sort // (e.g. greater{} -> descending). The buffer must be sized to N (a // compile-time power of 2); callers pad with sentinel values when the // logical length is smaller. 
- template - __device__ void block_sort(index idx, Array& buf) const + template + __device__ void block_sort(index idx, T& buf) const { static_assert(is_power_of_2(N), "N must be a power of 2"); for(index_int k = 2; k <= N; k <<= 1) @@ -160,14 +160,7 @@ struct bitonic_sort { const bool reverse = (tid & k) != 0; if(this->compare(buf[tid], buf[partner], reverse)) - { - swap(*(buf[tid].score), *(buf[partner].score)); - swap(*(buf[tid].box), *(buf[partner].box)); - swap(*(buf[tid].box+1), *(buf[partner].box+1)); - swap(*(buf[tid].box+2), *(buf[partner].box+2)); - swap(*(buf[tid].box+3), *(buf[partner].box+3)); - swap(*(buf[tid].box_index), *(buf[partner].box_index)); - } + swap(buf[tid], buf[partner]); } }); __syncthreads(); From 101125604a67afb9fb5edb56941534394c5e0961 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 20 May 2026 17:20:44 -0500 Subject: [PATCH 18/32] Progress on polish --- src/targets/gpu/jit/nonmaxsuppression.cpp | 4 +- .../migraphx/kernels/nonmaxsuppression.hpp | 40 +++++++++---------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index ff6b1d32ece..9d0c8cc3efd 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -195,8 +195,8 @@ struct nms_filter_compiler : compiler const auto aligned_num_boxes = static_cast(bit_ceil(static_cast(num_boxes))); // TODO: tune for max block size? 
- // num_boxes/2 because of strided thread work distribution - const auto block_size = compute_block_size(ctx, num_boxes/2, 256); + // ceil_div(num_boxes, 2) because of strided thread work distribution + const auto block_size = compute_block_size(ctx, (num_boxes + 1)/2, 256); hip_compile_options options; options.inputs = flatten_shapes(inputs); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 5ac4c46f4cb..ab4eb053cd6 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -47,11 +47,11 @@ struct nms_data Index box_index; }; -// Comparator for sorting nms_data{}. -template +// Comparator for sorting nms_data{} (or anything else with a `.score` field). struct nms_score_greater { - constexpr bool operator()(const nms_data& a, const nms_data& b) const + template + constexpr bool operator()(const T& a, const T& b) const { return a.score > b.score; } @@ -114,9 +114,11 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N // Load data into per-block buffer of nms_data. // Pads values after N with sentinel values. // Sorts the nms_data in descending order by score. -// boxes_tv: dims([N, 4]) of float. -// scores_tv: dims([N]) of float. -// sorted_tv: dims([N]) of nms_data{}. 
+// boxes_tv: dims([NumBatches, NumBoxes, 4]) +// scores_tv: dims([NumBatches, NumClasses, NumBoxes]) +// sorted_scores: output, dims([NumBatches * NumClasses, AlignedNumBoxes]) +// sorted_boxes: output, dims([NumBatches * NumClasses, AlignedNumBoxes, 4]) +// sorted_indices: output, dims([NumBatches * NumClasses, AlignedNumBoxes]) template (); - block_nms_data[i].box = array{0.f, 0.f, 0.f, 0.f}; + block_nms_data[i].score = numeric_lowest(); + block_nms_data[i].box = array{0.f, 0.f, 0.f, 0.f}; block_nms_data[i].box_index = -1; } }); __syncthreads(); - bitonic_sort>{nms_score_greater{}} - .template block_sort(idx, block_nms_data); - __syncthreads(); + bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_nms_data); // Copy sorted result back to global memory. auto block_out_scores = slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); @@ -201,7 +201,7 @@ __device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask for(index_int j = i + 1; j < NumBoxes; ++j) { mask[nms_packed_idx(i, j, NumBoxes)] = - nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold) ? 
1 : 0; + nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold); } }; @@ -316,14 +316,11 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores, // TODO: can add a static_assert on needed LDS size __shared__ uninitialized_buffer, NumBoxes> block_nms_data; - idx.local_stride(AlignedNumBoxes, [&](auto i) { - if(i < NumBoxes) - { - block_nms_data[i].score = my_sorted_scores[i]; - auto boxes_iter = my_sorted_boxes.begin_at(array{0, i, 0}); - copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin()); - block_nms_data[i].box_index = my_sorted_indices[i]; - } + idx.local_stride(NumBoxes, [&](auto i) { + block_nms_data[i].score = my_sorted_scores[i]; + auto boxes_iter = my_sorted_boxes.begin_at(array{0, i, 0}); + copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin()); + block_nms_data[i].box_index = my_sorted_indices[i]; }); auto my_mask = slice_tensor(mask, array{block_idx, 0}, slice_axes<1>()); auto my_output = slice_tensor(output, array{block_idx, 0, 0}, slice_axes<1, 2>()); @@ -366,7 +363,8 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts, { static_assert(NumBatchClass > 0); static_assert(NumBoxes > 0); - static_assert(NumBatchClass <= 16000, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]"); + // TODO: get a better bound on this + static_assert(NumBatchClass <= 8192, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]"); auto idx = make_index(); __shared__ index_int offsets[NumBatchClass]; From b5a95684db5512fbc26f14d72c0611b103b65c05 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 20 May 2026 18:58:10 -0500 Subject: [PATCH 19/32] Minor cleanup --- .../include/migraphx/kernels/nonmaxsuppression.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index ab4eb053cd6..7e050e1b51b 100644 --- 
a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -191,8 +191,8 @@ __device__ void nonmaxsuppression_sort( // amount of work regardless of where it falls in the triangle. // `nms_data`: nms_data nms_data{} tensor // `mask`: bool mask tensor -template -__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const IouThreshold iou_threshold) +template +__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold) { static_assert(NumBoxes > 0); constexpr index_int half = NumBoxes / 2; @@ -224,7 +224,7 @@ template {block_idx, 0, 0}, slice_axes<1, 2>()); // Read scalar tensor inputs - const int64_t max_output_boxes_per_class = max_out_p[0]; + const int max_output_boxes_per_class = max_out_p[0]; const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; From 32c779d94effdedbaeb18c562b51a2c10e1fcb10 Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 13:19:20 -0500 Subject: [PATCH 20/32] Move prepare_nonmaxsuppression into lowering --- src/targets/gpu/CMakeLists.txt | 2 +- .../gpu/prepare_nonmaxsuppression.hpp | 48 ---------- .../migraphx/kernels/nonmaxsuppression.hpp | 1 - src/targets/gpu/lowering.cpp | 77 +++++++++++++--- ...pare_nonmaxsuppression.cpp => nms_ops.cpp} | 91 +------------------ src/targets/gpu/target.cpp | 3 - 6 files changed, 68 insertions(+), 154 deletions(-) delete mode 100644 src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp rename src/targets/gpu/{prepare_nonmaxsuppression.cpp => nms_ops.cpp} (56%) diff --git a/src/targets/gpu/CMakeLists.txt b/src/targets/gpu/CMakeLists.txt index b8e92310b99..eee696a85a9 100644 --- a/src/targets/gpu/CMakeLists.txt +++ b/src/targets/gpu/CMakeLists.txt @@ -179,11 +179,11 @@ add_library(migraphx_gpu loop.cpp lrn.cpp mlir.cpp + nms_ops.cpp no_device.cpp pack_args.cpp prefuse_ops.cpp 
prepare_mlir.cpp - prepare_nonmaxsuppression.cpp prepare_reduce.cpp perfdb.cpp pooling.cpp diff --git a/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp b/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp deleted file mode 100644 index bf47c8607b9..00000000000 --- a/src/targets/gpu/include/migraphx/gpu/prepare_nonmaxsuppression.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - */ -#ifndef MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP -#define MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP - -#include -#include -#include - -namespace migraphx { -inline namespace MIGRAPHX_INLINE_NS { - -struct module; - -namespace gpu { - -struct MIGRAPHX_GPU_EXPORT prepare_nonmaxsuppression -{ - std::string name() const { return "gpu::prepare_nonmaxsuppression"; } - void apply(module& m) const; -}; - -} // namespace gpu -} // namespace MIGRAPHX_INLINE_NS -} // namespace migraphx -#endif // MIGRAPHX_GUARD_GPU_PREPARE_NONMAXSUPPRESSION_HPP diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 7e050e1b51b..5ba50f436d4 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -218,7 +218,6 @@ __device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask } } -// TODO: use template for types // Greedy filter that writes selections into a per-batch per-class region of output. template __device__ void nms_filter_per_block(const index idx, diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 196a0353a35..976d84ade0e 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -447,23 +448,73 @@ struct miopen_apply }); } + // Rewrites onnx `nonmaxsuppression` into the GPU op pipeline: + // gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact + // Each gpu::nms_* op is wrapped in gpu::precompile_op inline so the JIT + // compile pass can pick them up later. We can't rely on the main lowering + // loop to wrap them: it walks forward, and the new instructions land + // before `ins` so they would never be revisited. 
void add_nms_op() { apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { - auto s = ins->get_shape(); - auto output = insert_allocation(ins, s); - std::vector cpu_inputs; auto inputs = ins->inputs(); - std::transform( - inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { - return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); - }); - cpu_inputs.front() = - mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); - auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); - auto gpu_out = - mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output); - return mod->replace_instruction(ins, gpu_out); + const auto& boxes_s = inputs[0]->get_shape(); + const auto& scores_s = inputs[1]->get_shape(); + const auto num_batches = boxes_s.lens()[0]; + const auto num_boxes = boxes_s.lens()[1]; + const auto num_classes = scores_s.lens()[1]; + const auto iou_packed = num_boxes * (num_boxes - 1) / 2; + + // Fill in missing optional scalar inputs with default literals. + const shape default_max_s{shape::int64_type, {1}}; + const shape default_iou_s{shape::float_type, {1}}; + const shape default_thr_s{shape::float_type, {1}}; + if(inputs.size() < 3) + inputs.push_back( + mod->insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); + if(inputs.size() < 4) + inputs.push_back(mod->insert_literal(ins, literal{default_iou_s, {0.0f}})); + if(inputs.size() < 5) + inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}})); + + bool center_point_box = + ins->get_operator().to_value().at("center_point_box").to(); + + // Mask is scratch only; allocate up-front so the standard + // replace_allocate pass can later turn it into hip::allocate. 
+ shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; + auto mask_alloc = insert_allocation(ins, mask_shape); + + auto sorted = mod->insert_instruction( + ins, + make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), + inputs[0], + inputs[1]); + sorted = insert_precompile_op(sorted); + + auto filter = mod->insert_instruction( + ins, + make_op("gpu::nms_filter", + {{"num_batches", num_batches}, + {"num_classes", num_classes}, + {"num_boxes", num_boxes}}), + sorted, + inputs[2], + inputs[3], + inputs[4], + mask_alloc); + filter = insert_precompile_op(filter); + + auto raw_output = + mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); + auto bc_counts = + mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); + + auto compact = mod->insert_instruction( + ins, make_op("gpu::nms_compact"), bc_counts, raw_output); + compact = insert_precompile_op(compact); + + return mod->replace_instruction(ins, compact); }); } diff --git a/src/targets/gpu/prepare_nonmaxsuppression.cpp b/src/targets/gpu/nms_ops.cpp similarity index 56% rename from src/targets/gpu/prepare_nonmaxsuppression.cpp rename to src/targets/gpu/nms_ops.cpp index e6e2c021578..a1fb8fdfe48 100644 --- a/src/targets/gpu/prepare_nonmaxsuppression.cpp +++ b/src/targets/gpu/nms_ops.cpp @@ -22,14 +22,8 @@ * THE SOFTWARE. 
* */ -#include #include #include -#include -#include -#include -#include -#include #include #include @@ -59,9 +53,10 @@ struct nms_sort if(boxes_s.lens().size() != 3 or scores_s.lens().size() != 3) MIGRAPHX_THROW("gpu::nms_sort: boxes and scores must be 3-D"); const auto num_batches = boxes_s.lens()[0]; - const auto num_boxes = boxes_s.lens()[1]; + const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; - const auto aligned_b = static_cast(bit_ceil(static_cast(num_boxes))); + const auto aligned_b = + static_cast(bit_ceil(static_cast(num_boxes))); shape out_scores_shape{shape::float_type, {num_batches * num_classes, aligned_b}}; shape out_boxes_shape{shape::float_type, {num_batches * num_classes, aligned_b, 4}}; shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}}; @@ -120,86 +115,6 @@ struct nms_compact }; MIGRAPHX_REGISTER_OP(nms_compact); -namespace { - -struct find_nonmaxsuppression -{ - auto matcher() const { return match::name("nonmaxsuppression"); } - - void apply(module& m, const match::matcher_result& r) const - { - auto ins = r.result; - auto inputs = ins->inputs(); - if(inputs.size() < 2 or inputs.size() > 5) - MIGRAPHX_THROW("prepare_nonmaxsuppression: unexpected input count " + - std::to_string(inputs.size())); - - const auto& boxes_s = inputs[0]->get_shape(); - const auto& scores_s = inputs[1]->get_shape(); - if(boxes_s.ndim() != 3 or scores_s.ndim() != 3) - MIGRAPHX_THROW("prepare_nonmaxsuppression: boxes and scores must be 3-D"); - - const auto num_batches = boxes_s.lens()[0]; - const auto num_boxes = boxes_s.lens()[1]; - const auto num_classes = scores_s.lens()[1]; - const auto iou_packed = (num_boxes * (num_boxes - 1) / 2); - - // Fill in missing optional scalar inputs with default literals. - // TODO: this is the wrong way to handle this. Should be checking if the input is eval'able. 
- const shape default_max_s{shape::int64_type, {1}}; - const shape default_iou_s{shape::float_type, {1}}; - const shape default_thr_s{shape::float_type, {1}}; - if(inputs.size() < 3) - inputs.push_back(m.insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); - if(inputs.size() < 4) - inputs.push_back(m.insert_literal(ins, literal{default_iou_s, {0.0f}})); - if(inputs.size() < 5) - inputs.push_back(m.insert_literal(ins, literal{default_thr_s, {0.0f}})); - - auto op_val = ins->get_operator().to_value(); - bool center_point_box = op_val.at("center_point_box").to(); - - // Mask is scratch only; allocate up-front so the standard - // replace_allocate pass can later turn it into hip::allocate. - shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; - auto mask_alloc = - m.insert_instruction(ins, make_op("allocate", {{"shape", to_value(mask_shape)}})); - - auto sorted = m.insert_instruction( - ins, - make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), - inputs[0], - inputs[1]); - - auto filter = m.insert_instruction( - ins, - make_op("gpu::nms_filter", - {{"num_batches", num_batches}, {"num_classes", num_classes}, {"num_boxes", num_boxes}}), - sorted, - inputs[2], - inputs[3], - inputs[4], - mask_alloc); - - auto raw_output = - m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); - auto bc_counts = - m.insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); - - auto compact = - m.insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); - - m.replace_instruction(ins, compact); - } -}; - -} // namespace - -void prepare_nonmaxsuppression::apply(module& m) const -{ - match::find_matches(m, find_nonmaxsuppression{}); -} - } // namespace gpu } // namespace MIGRAPHX_INLINE_NS } // namespace migraphx diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp index 8ff00a75b7b..3ed3e72033d 100644 --- a/src/targets/gpu/target.cpp +++ 
b/src/targets/gpu/target.cpp @@ -73,7 +73,6 @@ #include #include #include -#include #include #include #include @@ -164,8 +163,6 @@ std::vector target::get_passes(migraphx::context& gctx, const compile_opti dead_code_elimination{}, auto_contiguous{}, dead_code_elimination{}, - prepare_nonmaxsuppression{}, - dead_code_elimination{}, lowering{&ctx, options.offload_copy}, eliminate_contiguous{"gpu::contiguous"}, dead_code_elimination{}, From c5fb1070777d6eb49204075a505e983ed67d806d Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 13:34:17 -0500 Subject: [PATCH 21/32] Add env var for retaining current NMS behavior for now --- docs/reference/MIGraphX-dev-env-vars.rst | 8 ++++++++ src/include/migraphx/op/nonmaxsuppression.hpp | 2 ++ src/onnx/parse_nonmaxsuppression.cpp | 15 ++++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/reference/MIGraphX-dev-env-vars.rst b/docs/reference/MIGraphX-dev-env-vars.rst index d84879717ec..c554dc31fee 100644 --- a/docs/reference/MIGraphX-dev-env-vars.rst +++ b/docs/reference/MIGraphX-dev-env-vars.rst @@ -298,6 +298,14 @@ Model performance tunable variables change the compilation behavior of a model. | Default: Full dynamic shape support is disabled. + * - | ``MIGRAPHX_USE_DYNAMIC_NMS`` + | When set, the ``NonMaxSuppression`` ONNX parser performs a dynamic slice on the raw indices tensor to trim it to the number of selected boxes, producing an output with a dynamic shape. + + - | ``1``: A dynamic slice is applied to the raw indices tensor, producing a dynamic-shaped output. + | ``0``: Returns to default behavior. + + | Default: The whole raw indices tensor is returned without slicing. 
+ Matching ********** diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index d154733a581..22cc28aec41 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -231,6 +231,8 @@ struct nonmaxsuppression double iou_threshold, double score_threshold) const { + // NOTE: should not need to fill with 0 + std::fill(output.begin(), output.end(), 0); const auto& lens = scores.get_shape().lens(); const auto num_batches = lens[0]; const auto num_classes = lens[1]; diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index 0ffffa03bcd..b67f33d880c 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -25,6 +25,8 @@ #include #include +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS); + namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace onnx { @@ -42,9 +44,16 @@ struct parse_nonmaxsuppression : op_parser auto nms_ins = info.add_instruction(op, args); // variable ends input slice to handle dynamic shape output auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); - auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); - auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); - return slice_ins; + if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{})) + { + return indices; + } + else + { + auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); + auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); + return slice_ins; + } } }; From 289d5adf2047e56fef1117c5d57716ad60a8b41f Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 13:43:05 -0500 Subject: [PATCH 22/32] Formatting --- src/include/migraphx/op/nonmaxsuppression.hpp | 13 +- 
src/onnx/parse_nonmaxsuppression.cpp | 6 +- src/targets/gpu/compile_hip_code_object.cpp | 1 - .../migraphx/gpu/compile_hip_code_object.hpp | 2 +- src/targets/gpu/jit/nonmaxsuppression.cpp | 44 +- .../migraphx/kernels/nonmaxsuppression.hpp | 135 +-- src/targets/gpu/lowering.cpp | 31 +- test/gpu/nonmaxsuppression.cpp | 921 ++++++++++-------- 8 files changed, 654 insertions(+), 499 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 22cc28aec41..b71bc4822eb 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -47,7 +47,8 @@ * optional(max_output_boxes_per_class), * optional(iou_threshold), * optional(score_threshold)); - * Outputs tuple of {tensor with dims[max_num_boxes, 3]: selected_box_indices, scalar int64_t: num_selected_indices} + * Outputs tuple of {tensor with dims[max_num_boxes, 3]: selected_box_indices, scalar int64_t: + * num_selected_indices} */ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { @@ -298,7 +299,7 @@ struct nonmaxsuppression argument compute(const shape& output_shape, std::vector args) const { // make buffer of maximum size - auto output_shapes = flatten_shapes({output_shape}); + auto output_shapes = flatten_shapes({output_shape}); shape max_output_shape = {output_shapes.at(0).type(), output_shapes.at(0).max_lens()}; argument result{max_output_shape}; argument num_selected_result{output_shapes.at(1)}; @@ -307,9 +308,7 @@ struct nonmaxsuppression (args.size() > 2) ? (args.at(2).at()) : 0; if(max_output_boxes_per_class == 0) { - num_selected_result.visit([&](auto output){ - output[0] = 0; - }); + num_selected_result.visit([&](auto output) { output[0] = 0; }); return {{result, num_selected_result}}; } double iou_threshold = (args.size() > 3) ? 
(args.at(3).at()) : 0.0f; @@ -326,9 +325,7 @@ struct nonmaxsuppression score_threshold); }); }); - num_selected_result.visit([&](auto output){ - output[0] = num_selected; - }); + num_selected_result.visit([&](auto output) { output[0] = num_selected; }); return {{result, num_selected_result}}; } }; diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index b67f33d880c..74427a9d5b8 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -50,8 +50,10 @@ struct parse_nonmaxsuppression : op_parser } else { - auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); - auto slice_ins = info.add_instruction(make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); + auto num_selected = + info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); + auto slice_ins = info.add_instruction( + make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); return slice_ins; } } diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp index efe3b4f80bd..868153c2c9e 100644 --- a/src/targets/gpu/compile_hip_code_object.cpp +++ b/src/targets/gpu/compile_hip_code_object.cpp @@ -192,7 +192,6 @@ compute_global_for(const context& ctx, std::size_t n, std::size_t over) }; } - // `n`: The amount of parallel work within a block. // `max_block_size`: Upper limit on block size. 
std::size_t compute_block_size(const context& ctx, std::size_t n, std::size_t max_block_size) diff --git a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp index 592e32b9af4..f11051916cf 100644 --- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp +++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp @@ -36,7 +36,7 @@ namespace gpu { struct context; struct hip_compile_options -{ +{ // Total number of threads std::size_t global; // Threads per block diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index 9d0c8cc3efd..732e8702410 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -145,10 +145,10 @@ struct nms_sort_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto& boxes_s = inputs[0]; - const auto& scores_s = inputs[1]; + const auto& boxes_s = inputs[0]; + const auto& scores_s = inputs[1]; const auto num_batches = boxes_s.lens()[0]; - const auto num_boxes = boxes_s.lens()[1]; + const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; const auto aligned_num_boxes = static_cast(bit_ceil(static_cast(num_boxes))); @@ -191,12 +191,12 @@ struct nms_filter_compiler : compiler { const auto num_batches = v.at("num_batches").to(); const auto num_classes = v.at("num_classes").to(); - const auto num_boxes = v.at("num_boxes").to(); + const auto num_boxes = v.at("num_boxes").to(); const auto aligned_num_boxes = static_cast(bit_ceil(static_cast(num_boxes))); // TODO: tune for max block size? 
// ceil_div(num_boxes, 2) because of strided thread work distribution - const auto block_size = compute_block_size(ctx, (num_boxes + 1)/2, 256); + const auto block_size = compute_block_size(ctx, (num_boxes + 1) / 2, 256); hip_compile_options options; options.inputs = flatten_shapes(inputs); @@ -205,14 +205,14 @@ struct nms_filter_compiler : compiler options.virtual_inputs = options.inputs; options.set_launch_params(v, num_batches * num_classes * block_size, block_size); - auto src = interpolate_string( - nms_filter_kernel_src, - {{"params", enum_params(options.inputs.size(), "void * private_p")}, - {"args", enum_params(options.inputs.size(), "private_p")}, - {"num_batches", std::to_string(num_batches)}, - {"num_classes", std::to_string(num_classes)}, - {"num_boxes", std::to_string(num_boxes)}, - {"aligned_num_boxes", std::to_string(aligned_num_boxes)}}); + auto src = + interpolate_string(nms_filter_kernel_src, + {{"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, + {"num_batches", std::to_string(num_batches)}, + {"num_classes", std::to_string(num_classes)}, + {"num_boxes", std::to_string(num_boxes)}, + {"aligned_num_boxes", std::to_string(aligned_num_boxes)}}); return compile_hip_code_object(ctx, src, options); } @@ -231,10 +231,10 @@ struct nms_compact_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto& cnt_s = inputs[0]; - const auto& indices_s = inputs[1]; + const auto& cnt_s = inputs[0]; + const auto& indices_s = inputs[1]; const auto num_batch_class = cnt_s.elements(); - const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3}); + const auto num_boxes = indices_s.elements() / (num_batch_class * std::size_t{3}); // TODO: tune for block size? // num_boxes block size could also work? 
const auto block_size = compute_block_size(ctx, num_batch_class * num_boxes, 256); @@ -246,12 +246,12 @@ struct nms_compact_compiler : compiler options.virtual_inputs = options.inputs; options.set_launch_params(v, block_size, block_size); - auto src = interpolate_string( - nms_compact_kernel_src, - {{"params", enum_params(options.inputs.size(), "void * private_p")}, - {"args", enum_params(options.inputs.size(), "private_p")}, - {"num_batch_class", std::to_string(num_batch_class)}, - {"num_boxes", std::to_string(num_boxes)}}); + auto src = + interpolate_string(nms_compact_kernel_src, + {{"params", enum_params(options.inputs.size(), "void * private_p")}, + {"args", enum_params(options.inputs.size(), "private_p")}, + {"num_batch_class", std::to_string(num_batch_class)}, + {"num_boxes", std::to_string(num_boxes)}}); return compile_hip_code_object(ctx, src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 5ba50f436d4..7ae9638e173 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -86,8 +86,7 @@ __device__ inline array nms_normalize_box(const Box box) } template -__device__ inline bool -nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold) +__device__ inline bool nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold) { const float left = max(a[0], b[0]); const float right = min(a[2], b[2]); @@ -129,38 +128,40 @@ template -__device__ void nonmaxsuppression_sort( - const Boxes boxes_tv, - const Scores scores_tv, - SortedScores sorted_scores, - SortedBoxes sorted_boxes, - SortedIndices sorted_indices) +__device__ void nonmaxsuppression_sort(const Boxes boxes_tv, + const Scores scores_tv, + SortedScores sorted_scores, + SortedBoxes sorted_boxes, + SortedIndices sorted_indices) { 
static_assert(NumBatches > 0); static_assert(NumClasses > 0); static_assert(NumBoxes > 0); static_assert(AlignedNumBoxes > 0); - auto idx = make_index(); + auto idx = make_index(); const index_int block_id = idx.group; - const int batch_idx = block_id / NumClasses; - const int class_idx = block_id % NumClasses; - + const int batch_idx = block_id / NumClasses; + const int class_idx = block_id % NumClasses; + // numpy indexing: scores[batch_idx, class_idx, :] - const auto my_scores = slice_tensor(scores_tv, array{batch_idx, class_idx, 0}, slice_axes<2>()); - - using scores_type = typename SortedScores::type; - using boxes_type = typename SortedBoxes::type; + const auto my_scores = + slice_tensor(scores_tv, array{batch_idx, class_idx, 0}, slice_axes<2>()); + + using scores_type = typename SortedScores::type; + using boxes_type = typename SortedBoxes::type; using indices_type = typename SortedIndices::type; // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS. // TODO: can add a static_assert on needed LDS size - __shared__ uninitialized_buffer, AlignedNumBoxes> block_nms_data; + __shared__ + uninitialized_buffer, AlignedNumBoxes> + block_nms_data; idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - block_nms_data[i].score = my_scores[i]; - block_nms_data[i].box = nms_normalize_box( - slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>())); + block_nms_data[i].score = my_scores[i]; + block_nms_data[i].box = nms_normalize_box( + slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>())); block_nms_data[i].box_index = static_cast(i); } else @@ -175,12 +176,15 @@ __device__ void nonmaxsuppression_sort( bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_nms_data); // Copy sorted result back to global memory. 
- auto block_out_scores = slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); - auto block_out_boxes = slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); - auto block_out_indices = slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); + auto block_out_scores = + slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); + auto block_out_boxes = + slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); + auto block_out_indices = + slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); idx.local_stride(AlignedNumBoxes, [&](auto i) { - block_out_scores[i] = block_nms_data[i].score; - auto out_box_iter = block_out_boxes.begin_at(array{0, i, 0}); + block_out_scores[i] = block_nms_data[i].score; + auto out_box_iter = block_out_boxes.begin_at(array{0, i, 0}); copy(block_nms_data[i].box.begin(), block_nms_data[i].box.end(), out_box_iter); block_out_indices[i] = block_nms_data[i].box_index; }); @@ -192,7 +196,8 @@ __device__ void nonmaxsuppression_sort( // `nms_data`: nms_data nms_data{} tensor // `mask`: bool mask tensor template -__device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold) +__device__ void +nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold) { static_assert(NumBoxes > 0); constexpr index_int half = NumBoxes / 2; @@ -219,7 +224,12 @@ __device__ void nms_make_iou_mask(const index idx, const NMSData nms_data, Mask } // Greedy filter that writes selections into a per-batch per-class region of output. 
-template +template __device__ void nms_filter_per_block(const index idx, const NMSData nms_data, const Mask mask, @@ -230,15 +240,14 @@ __device__ void nms_filter_per_block(const index idx, { static_assert(NumBoxes > 0); const index_int block_id = idx.group; - const int batch_idx = block_id / NumClasses; - const int class_idx = block_id % NumClasses; + const int batch_idx = block_id / NumClasses; + const int class_idx = block_id % NumClasses; // TODO: use bits for removed mask __shared__ uint8_t removed[NumBoxes]; // Match the ref op: only filter by score when score_threshold > 0. const bool do_filter = score_thr > 0.f; - idx.local_stride(NumBoxes, [&](auto i) { - removed[i] = (do_filter and nms_data[i].score < score_thr); - }); + idx.local_stride(NumBoxes, + [&](auto i) { removed[i] = (do_filter and nms_data[i].score < score_thr); }); __syncthreads(); index_int output_idx = 0; @@ -301,49 +310,51 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores, static_assert(NumClasses > 0); static_assert(NumBoxes > 0); - auto idx = make_index(); + auto idx = make_index(); const index_int block_idx = idx.group; - auto my_sorted_scores = slice_tensor(sorted_scores, array{block_idx, 0}, slice_axes<1>()); - auto my_sorted_boxes = slice_tensor(sorted_boxes, array{block_idx, 0, 0}, slice_axes<1, 2>()); - auto my_sorted_indices = slice_tensor(sorted_indices, array{block_idx, 0}, slice_axes<1>()); + auto my_sorted_scores = + slice_tensor(sorted_scores, array{block_idx, 0}, slice_axes<1>()); + auto my_sorted_boxes = + slice_tensor(sorted_boxes, array{block_idx, 0, 0}, slice_axes<1, 2>()); + auto my_sorted_indices = + slice_tensor(sorted_indices, array{block_idx, 0}, slice_axes<1>()); - using scores_type = typename SortedScores::type; - using boxes_type = typename SortedBoxes::type; + using scores_type = typename SortedScores::type; + using boxes_type = typename SortedBoxes::type; using indices_type = typename SortedIndices::type; // Use shared memory for 
sorting per-block nms_data. Assuming it fits in LDS. // TODO: can add a static_assert on needed LDS size - __shared__ uninitialized_buffer, NumBoxes> block_nms_data; + __shared__ uninitialized_buffer, NumBoxes> + block_nms_data; idx.local_stride(NumBoxes, [&](auto i) { block_nms_data[i].score = my_sorted_scores[i]; - auto boxes_iter = my_sorted_boxes.begin_at(array{0, i, 0}); + auto boxes_iter = my_sorted_boxes.begin_at(array{0, i, 0}); copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin()); block_nms_data[i].box_index = my_sorted_indices[i]; }); - auto my_mask = slice_tensor(mask, array{block_idx, 0}, slice_axes<1>()); + auto my_mask = slice_tensor(mask, array{block_idx, 0}, slice_axes<1>()); auto my_output = slice_tensor(output, array{block_idx, 0, 0}, slice_axes<1, 2>()); // Read scalar tensor inputs const int max_output_boxes_per_class = max_out_p[0]; - const float iou_thr_val = iou_thr_p[0]; - const float score_thr_val = score_thr_p[0]; + const float iou_thr_val = iou_thr_p[0]; + const float score_thr_val = score_thr_p[0]; __syncthreads(); nms_make_iou_mask(idx, block_nms_data, my_mask, iou_thr_val); __syncthreads(); - nms_filter_per_block( - idx, - block_nms_data, - my_mask, - max_output_boxes_per_class, - score_thr_val, - my_output, - bc_counts); + nms_filter_per_block(idx, + block_nms_data, + my_mask, + max_output_boxes_per_class, + score_thr_val, + my_output, + bc_counts); } - // Move batch/class box index entries to the beginning of the output buffer. Runs with 1 block. // `bc_counts`: Number of selected boxes per batch per class. 
(read-only) // `indices`: Box indices, kernel packs selected boxes in-place to the beginning @@ -355,20 +366,20 @@ template -__device__ void nonmaxsuppression_compact(const Counts bc_counts, - const Idx indices, - Out output, - Num num_selected) +__device__ void +nonmaxsuppression_compact(const Counts bc_counts, const Idx indices, Out output, Num num_selected) { static_assert(NumBatchClass > 0); static_assert(NumBoxes > 0); // TODO: get a better bound on this - static_assert(NumBatchClass <= 8192, "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]"); + static_assert(NumBatchClass <= 8192, + "nms_compact: NumBatchClass exceeds the LDS budget for offsets[]"); auto idx = make_index(); __shared__ index_int offsets[NumBatchClass]; // Exclusive prefix sum on bc_counts to get offsets - // TODO: there's probably a better way to get the exclusive prefix sum rather than doing the minus each time. + // TODO: there's probably a better way to get the exclusive prefix sum rather than doing the + // minus each time. block_scan( idx, op::sum{}, @@ -381,22 +392,22 @@ __device__ void nonmaxsuppression_compact(const Counts bc_counts, // Get num_selected_boxes from last value of exclusive scan and add last bc_counts value. if(idx.local == 0) { - num_selected[0] = offsets[NumBatchClass-1] + bc_counts[NumBatchClass-1]; + num_selected[0] = offsets[NumBatchClass - 1] + bc_counts[NumBatchClass - 1]; } // rearrange index values to make the output packed. // TODO: this could be done in-place to save memory. 
- constexpr index_int index_size = 3; + constexpr index_int index_size = 3; constexpr index_int max_entries = NumBatchClass * NumBoxes; idx.local_stride(max_entries, [&](auto i) { const index_int batch_class_idx = i / NumBoxes; - const index_int box_idx = i % NumBoxes; + const index_int box_idx = i % NumBoxes; if(box_idx < bc_counts[batch_class_idx]) { for(int k = 0; k < 3; ++k) { output[(offsets[batch_class_idx] + box_idx) * index_size + k] = - indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k] ; + indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k]; } } }); diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 976d84ade0e..a7baf80a755 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -458,8 +458,8 @@ struct miopen_apply { apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { auto inputs = ins->inputs(); - const auto& boxes_s = inputs[0]->get_shape(); - const auto& scores_s = inputs[1]->get_shape(); + const auto& boxes_s = inputs[0]->get_shape(); + const auto& scores_s = inputs[1]->get_shape(); const auto num_batches = boxes_s.lens()[0]; const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; @@ -492,26 +492,25 @@ struct miopen_apply inputs[1]); sorted = insert_precompile_op(sorted); - auto filter = mod->insert_instruction( - ins, - make_op("gpu::nms_filter", - {{"num_batches", num_batches}, - {"num_classes", num_classes}, - {"num_boxes", num_boxes}}), - sorted, - inputs[2], - inputs[3], - inputs[4], - mask_alloc); - filter = insert_precompile_op(filter); + auto filter = mod->insert_instruction(ins, + make_op("gpu::nms_filter", + {{"num_batches", num_batches}, + {"num_classes", num_classes}, + {"num_boxes", num_boxes}}), + sorted, + inputs[2], + inputs[3], + inputs[4], + mask_alloc); + filter = insert_precompile_op(filter); auto raw_output = mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); auto bc_counts = 
mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); - auto compact = mod->insert_instruction( - ins, make_op("gpu::nms_compact"), bc_counts, raw_output); + auto compact = + mod->insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); compact = insert_precompile_op(compact); return mod->replace_instruction(ins, compact); diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp index 119b2ab6a8e..f123263b596 100644 --- a/test/gpu/nonmaxsuppression.cpp +++ b/test/gpu/nonmaxsuppression.cpp @@ -90,12 +90,12 @@ TEST_CASE(nms_default_test) score_threshold); add_nms_return(mm, nms); - std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; - int64_t max_out_val = 4; - float iou_val = 0.5f; - float score_val = 0.0f; + int64_t max_out_val = 4; + float iou_val = 0.5f; + float score_val = 0.0f; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -133,10 +133,10 @@ TEST_CASE(nms_identical_all_test) score_threshold); add_nms_return(mm, nms); - std::vector boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, - 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5}; + std::vector boxes_vec = {0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, + 0.5, 0.5, 0.7, 0.7, 0.5, 0.5, 0.7, 0.7, 0.7, 0.7, 0.5, 0.5}; std::vector scores_vec = {0.9f, 0.9f, 0.9f, 0.9f, 0.9f, 0.9f}; - float iou_val = 0.1f; + float iou_val = 0.1f; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -171,9 +171,9 @@ TEST_CASE(nms_not_center_test) score_threshold); add_nms_return(mm, nms); - 
std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, - 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, - 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; migraphx::parameter_map host_params; @@ -217,7 +217,7 @@ TEST_CASE(nms_transpose1_test) 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }; std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; - int64_t max_out_val = 4; + int64_t max_out_val = 4; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -296,10 +296,10 @@ TEST_CASE(nms_multi_batch_test) score_threshold); add_nms_return(mm, nms); - std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0, - 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0, + 0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; std::vector scores_vec = { 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; int64_t max_out_val = 4; @@ -343,9 +343,9 @@ TEST_CASE(nms_multi_class_test) score_threshold); add_nms_return(mm, nms); - std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, - 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, - 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + std::vector boxes_vec = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; std::vector scores_vec = { 
0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f, 0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; float score_val = 0.0f; @@ -376,26 +376,27 @@ TEST_CASE(nms_20boxes_test) auto iou_threshold = mm->add_literal(0.5000f); auto score_threshold = mm->add_literal(0.0000f); - auto nms = - mm->add_instruction(migraphx::make_op("nonmaxsuppression"), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); add_nms_return(mm, nms); std::vector boxes_vec = { - 32.7256f, 35.1377f, 43.0832f, 42.2579f, 13.9286f, 15.6152f, 21.5240f, 28.2727f, 44.0782f, 37.5280f, 52.9916f, 48.3318f, - 38.8011f, 32.1818f, 50.5110f, 37.5550f, 33.9761f, -1.6170f, 43.8622f, 11.0347f, 5.3569f, 42.6478f, 14.1070f, 54.9145f, - 18.9216f, 34.8446f, 27.7505f, 41.2693f, -0.4375f, 36.7849f, 4.8178f, 41.8215f, 6.9987f, 1.1282f, 8.4302f, 11.6832f, - 30.5954f, 21.0410f, 37.7095f, 23.9976f, 35.2360f, 16.6405f, 39.2402f, 20.4393f, 45.0158f, 45.7867f, 51.7352f, 46.8898f, - 9.8174f, 26.1848f, 22.7651f, 38.2017f, 16.3854f, 35.9841f, 20.6606f, 46.2920f, 22.5697f, 16.7346f, 24.3859f, 27.6069f, - 7.0039f, 5.3968f, 11.9433f, 17.3270f, 3.9409f, 24.0168f, 9.0512f, 31.4417f, 18.6518f, -1.2903f, 28.9187f, 7.6721f, - 6.9462f, 39.9030f, 15.7447f, 42.8601f, 27.5034f, 30.2815f, 39.4780f, 32.8849f}; - std::vector scores_vec = { - 0.6979f, 0.4657f, 0.8326f, 0.2503f, 0.1204f, 0.1810f, 0.7501f, 0.5157f, 0.2451f, 0.5509f, 0.2371f, 0.7267f, - 0.5015f, 0.4429f, 0.3714f, 0.6673f, 0.4256f, 0.1789f, 0.2062f, 0.9657f}; + 32.7256f, 35.1377f, 43.0832f, 42.2579f, 13.9286f, 15.6152f, 21.5240f, 28.2727f, 44.0782f, + 37.5280f, 52.9916f, 48.3318f, 38.8011f, 32.1818f, 50.5110f, 37.5550f, 33.9761f, -1.6170f, + 43.8622f, 11.0347f, 5.3569f, 42.6478f, 14.1070f, 54.9145f, 18.9216f, 34.8446f, 27.7505f, + 41.2693f, -0.4375f, 36.7849f, 4.8178f, 41.8215f, 6.9987f, 1.1282f, 8.4302f, 11.6832f, + 30.5954f, 21.0410f, 
37.7095f, 23.9976f, 35.2360f, 16.6405f, 39.2402f, 20.4393f, 45.0158f, + 45.7867f, 51.7352f, 46.8898f, 9.8174f, 26.1848f, 22.7651f, 38.2017f, 16.3854f, 35.9841f, + 20.6606f, 46.2920f, 22.5697f, 16.7346f, 24.3859f, 27.6069f, 7.0039f, 5.3968f, 11.9433f, + 17.3270f, 3.9409f, 24.0168f, 9.0512f, 31.4417f, 18.6518f, -1.2903f, 28.9187f, 7.6721f, + 6.9462f, 39.9030f, 15.7447f, 42.8601f, 27.5034f, 30.2815f, 39.4780f, 32.8849f}; + std::vector scores_vec = {0.6979f, 0.4657f, 0.8326f, 0.2503f, 0.1204f, 0.1810f, 0.7501f, + 0.5157f, 0.2451f, 0.5509f, 0.2371f, 0.7267f, 0.5015f, 0.4429f, + 0.3714f, 0.6673f, 0.4256f, 0.1789f, 0.2062f, 0.9657f}; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -403,7 +404,8 @@ TEST_CASE(nms_20boxes_test) auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); - std::vector gold = {0, 0, 19, 0, 0, 2, 0, 0, 6, 0, 0, 11, 0, 0, 0, 0, 0, 15, 0, 0, 9, 0, 0, 7, 0, 0, 12, 0, 0, 1}; + std::vector gold = {0, 0, 19, 0, 0, 2, 0, 0, 6, 0, 0, 11, 0, 0, 0, + 0, 0, 15, 0, 0, 9, 0, 0, 7, 0, 0, 12, 0, 0, 1}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); EXPECT(num_selected == 10); } @@ -432,29 +434,35 @@ TEST_CASE(nms_50boxes_center_test) add_nms_return(mm, nms); std::vector boxes_vec = { - 90.8581f, 82.6292f, 23.5447f, 19.9060f, 69.9707f, 89.6161f, 29.1830f, 26.1572f, 26.5870f, 14.0249f, 15.5215f, 14.1630f, - 96.9176f, 55.4036f, 5.1730f, 8.1873f, 77.8751f, 10.8576f, 1.4042f, 7.8632f, 71.6890f, 67.2240f, 7.6600f, 22.6344f, - 44.9361f, 28.1234f, 4.8228f, 24.6805f, 27.2242f, 65.9423f, 20.6521f, 4.0531f, 9.6391f, 72.6995f, 4.5331f, 2.9302f, - 90.2602f, 76.8647f, 15.6836f, 18.2386f, 45.5776f, 10.7741f, 21.1336f, 5.2390f, 20.2363f, 91.6012f, 17.8524f, 24.9153f, - 30.5957f, 23.0214f, 6.7935f, 9.9997f, 57.9220f, 3.7413f, 24.3196f, 5.1723f, 17.6773f, 55.4852f, 21.7468f, 27.7081f, - 85.6614f, 37.0922f, 22.4305f, 5.8004f, 75.8520f, 82.9790f, 
4.8007f, 9.2569f, 71.9463f, 80.8251f, 4.5889f, 5.4548f, - 43.2093f, 31.7139f, 27.8993f, 4.3492f, 62.7309f, 95.2899f, 12.5298f, 1.6133f, 58.4098f, 29.0918f, 9.7275f, 2.6065f, - 64.9847f, 51.5057f, 15.1689f, 6.0646f, 8.4444f, 25.5965f, 20.2231f, 2.5481f, 41.5807f, 93.6044f, 28.7131f, 18.1432f, - 4.1614f, 16.4608f, 9.3069f, 20.7407f, 49.3991f, 4.4911f, 27.8194f, 12.4153f, 32.9861f, 43.5097f, 1.7209f, 10.2217f, - 14.4524f, 99.2376f, 17.1007f, 15.6313f, 10.3403f, 89.1677f, 19.3853f, 26.3751f, 58.7645f, 74.8608f, 4.0710f, 25.6828f, - 17.0593f, 89.0792f, 5.0698f, 2.2608f, 92.5120f, 89.3447f, 13.1543f, 6.2635f, 58.1061f, 51.8858f, 29.0207f, 7.8656f, - 34.6870f, 31.5929f, 18.2852f, 8.2322f, 59.0915f, 77.2012f, 28.0577f, 17.5657f, 2.2804f, 66.1661f, 24.3265f, 13.0716f, - 95.8559f, 37.3658f, 14.5541f, 2.4284f, 48.2303f, 9.4467f, 23.7581f, 11.8348f, 78.2735f, 74.6790f, 1.5173f, 16.1888f, - 8.2730f, 26.2461f, 4.1652f, 3.9485f, 48.6658f, 93.6813f, 25.0534f, 25.1703f, 49.0707f, 24.0971f, 24.1077f, 2.5069f, - 93.7826f, 12.2758f, 7.7466f, 27.8204f, 57.1728f, 83.1113f, 16.3923f, 3.8743f, 47.3489f, 15.3284f, 18.5745f, 25.4637f, - 26.6976f, 17.9268f, 26.1644f, 27.1769f, 33.1569f, 59.9383f, 18.4901f, 29.4075f, 52.0672f, 87.4562f, 12.9646f, 24.2588f, - 43.8911f, 19.6435f, 11.8513f, 23.6048f, 2.1612f, 31.0324f, 13.3506f, 19.6320f}; + 90.8581f, 82.6292f, 23.5447f, 19.9060f, 69.9707f, 89.6161f, 29.1830f, 26.1572f, 26.5870f, + 14.0249f, 15.5215f, 14.1630f, 96.9176f, 55.4036f, 5.1730f, 8.1873f, 77.8751f, 10.8576f, + 1.4042f, 7.8632f, 71.6890f, 67.2240f, 7.6600f, 22.6344f, 44.9361f, 28.1234f, 4.8228f, + 24.6805f, 27.2242f, 65.9423f, 20.6521f, 4.0531f, 9.6391f, 72.6995f, 4.5331f, 2.9302f, + 90.2602f, 76.8647f, 15.6836f, 18.2386f, 45.5776f, 10.7741f, 21.1336f, 5.2390f, 20.2363f, + 91.6012f, 17.8524f, 24.9153f, 30.5957f, 23.0214f, 6.7935f, 9.9997f, 57.9220f, 3.7413f, + 24.3196f, 5.1723f, 17.6773f, 55.4852f, 21.7468f, 27.7081f, 85.6614f, 37.0922f, 22.4305f, + 5.8004f, 75.8520f, 82.9790f, 4.8007f, 
9.2569f, 71.9463f, 80.8251f, 4.5889f, 5.4548f, + 43.2093f, 31.7139f, 27.8993f, 4.3492f, 62.7309f, 95.2899f, 12.5298f, 1.6133f, 58.4098f, + 29.0918f, 9.7275f, 2.6065f, 64.9847f, 51.5057f, 15.1689f, 6.0646f, 8.4444f, 25.5965f, + 20.2231f, 2.5481f, 41.5807f, 93.6044f, 28.7131f, 18.1432f, 4.1614f, 16.4608f, 9.3069f, + 20.7407f, 49.3991f, 4.4911f, 27.8194f, 12.4153f, 32.9861f, 43.5097f, 1.7209f, 10.2217f, + 14.4524f, 99.2376f, 17.1007f, 15.6313f, 10.3403f, 89.1677f, 19.3853f, 26.3751f, 58.7645f, + 74.8608f, 4.0710f, 25.6828f, 17.0593f, 89.0792f, 5.0698f, 2.2608f, 92.5120f, 89.3447f, + 13.1543f, 6.2635f, 58.1061f, 51.8858f, 29.0207f, 7.8656f, 34.6870f, 31.5929f, 18.2852f, + 8.2322f, 59.0915f, 77.2012f, 28.0577f, 17.5657f, 2.2804f, 66.1661f, 24.3265f, 13.0716f, + 95.8559f, 37.3658f, 14.5541f, 2.4284f, 48.2303f, 9.4467f, 23.7581f, 11.8348f, 78.2735f, + 74.6790f, 1.5173f, 16.1888f, 8.2730f, 26.2461f, 4.1652f, 3.9485f, 48.6658f, 93.6813f, + 25.0534f, 25.1703f, 49.0707f, 24.0971f, 24.1077f, 2.5069f, 93.7826f, 12.2758f, 7.7466f, + 27.8204f, 57.1728f, 83.1113f, 16.3923f, 3.8743f, 47.3489f, 15.3284f, 18.5745f, 25.4637f, + 26.6976f, 17.9268f, 26.1644f, 27.1769f, 33.1569f, 59.9383f, 18.4901f, 29.4075f, 52.0672f, + 87.4562f, 12.9646f, 24.2588f, 43.8911f, 19.6435f, 11.8513f, 23.6048f, 2.1612f, 31.0324f, + 13.3506f, 19.6320f}; std::vector scores_vec = { - 0.8011f, 0.2211f, 0.5825f, 0.5628f, 0.8718f, 0.5165f, 0.4466f, 0.6756f, 0.3398f, 0.2258f, 0.5301f, 0.4752f, - 0.3093f, 0.4308f, 0.4298f, 0.3947f, 0.4415f, 0.7172f, 0.3672f, 0.9540f, 0.9247f, 0.5328f, 0.3955f, 0.5819f, - 0.8637f, 0.6873f, 0.8240f, 0.5795f, 0.6696f, 0.3593f, 0.7614f, 0.2822f, 0.7253f, 0.8746f, 0.2189f, 0.6529f, - 0.1856f, 0.7531f, 0.1760f, 0.9423f, 0.2237f, 0.9630f, 0.8208f, 0.6343f, 0.8044f, 0.8156f, 0.9514f, 0.3280f, - 0.6311f, 0.1855f}; + 0.8011f, 0.2211f, 0.5825f, 0.5628f, 0.8718f, 0.5165f, 0.4466f, 0.6756f, 0.3398f, 0.2258f, + 0.5301f, 0.4752f, 0.3093f, 0.4308f, 0.4298f, 0.3947f, 0.4415f, 0.7172f, 0.3672f, 
0.9540f, + 0.9247f, 0.5328f, 0.3955f, 0.5819f, 0.8637f, 0.6873f, 0.8240f, 0.5795f, 0.6696f, 0.3593f, + 0.7614f, 0.2822f, 0.7253f, 0.8746f, 0.2189f, 0.6529f, 0.1856f, 0.7531f, 0.1760f, 0.9423f, + 0.2237f, 0.9630f, 0.8208f, 0.6343f, 0.8044f, 0.8156f, 0.9514f, 0.3280f, 0.6311f, 0.1855f}; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -462,7 +470,10 @@ TEST_CASE(nms_50boxes_center_test) auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); - std::vector gold = {0, 0, 41, 0, 0, 19, 0, 0, 46, 0, 0, 39, 0, 0, 20, 0, 0, 33, 0, 0, 4, 0, 0, 24, 0, 0, 26, 0, 0, 42, 0, 0, 45, 0, 0, 44, 0, 0, 0, 0, 0, 30, 0, 0, 32, 0, 0, 17, 0, 0, 25, 0, 0, 7, 0, 0, 28, 0, 0, 35}; + std::vector gold = {0, 0, 41, 0, 0, 19, 0, 0, 46, 0, 0, 39, 0, 0, 20, + 0, 0, 33, 0, 0, 4, 0, 0, 24, 0, 0, 26, 0, 0, 42, + 0, 0, 45, 0, 0, 44, 0, 0, 0, 0, 0, 30, 0, 0, 32, + 0, 0, 17, 0, 0, 25, 0, 0, 7, 0, 0, 28, 0, 0, 35}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); EXPECT(num_selected == 20); } @@ -481,101 +492,136 @@ TEST_CASE(nms_100boxes_2batch_test) auto iou_threshold = mm->add_literal(0.5000f); auto score_threshold = mm->add_literal(0.1000f); - auto nms = - mm->add_instruction(migraphx::make_op("nonmaxsuppression"), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); add_nms_return(mm, nms); std::vector boxes_vec = { - -3.8699f, 108.8880f, 20.8101f, 137.5783f, 149.9079f, 29.3134f, 203.7504f, 39.2031f, 121.6031f, 107.1528f, 162.2282f, 118.8275f, - 27.1146f, 87.2265f, 42.1365f, 141.7457f, -7.3128f, 91.3799f, 44.0012f, 95.0142f, 25.9397f, 97.1572f, 47.4736f, 111.8955f, - 170.3318f, 143.6689f, 221.6791f, 161.9004f, 82.3933f, 144.8881f, 101.0310f, 174.8098f, 138.9017f, 80.6305f, 174.7306f, 116.2308f, - 
115.0719f, 104.8666f, 139.4914f, 134.9707f, 105.8753f, 183.2658f, 123.0900f, 189.2287f, 2.3726f, 16.2585f, 55.6795f, 31.6349f, - 183.1709f, -1.9651f, 195.2389f, 48.8066f, 57.2666f, -1.7671f, 63.2705f, 36.8507f, 105.0166f, 111.9228f, 126.1903f, 151.2225f, - 118.2848f, 63.4507f, 161.6255f, 103.9927f, 105.5274f, 131.8586f, 154.1659f, 177.8699f, 158.1560f, 132.0321f, 218.0818f, 136.4605f, - 20.4451f, 55.4126f, 38.9305f, 78.0425f, 89.1363f, 163.2572f, 114.2048f, 196.0894f, 76.2707f, 142.0220f, 85.3431f, 162.9909f, - 77.3750f, 28.6949f, 112.2925f, 79.5191f, -6.0851f, 58.1025f, 53.7721f, 87.5743f, 5.6429f, 39.7135f, 47.9949f, 86.0625f, - 37.5563f, 5.8879f, 73.6739f, 57.1568f, 48.8660f, 14.1653f, 73.0158f, 44.9480f, 58.0793f, 159.8937f, 113.0820f, 214.5573f, - 107.0385f, 69.7607f, 137.3566f, 105.4010f, 122.4620f, 51.0809f, 131.3896f, 102.2471f, 71.0835f, 135.3897f, 93.6408f, 156.4846f, - 79.2752f, 95.3835f, 84.2380f, 125.8137f, 37.0673f, 171.0514f, 49.9841f, 203.4046f, 116.6400f, 152.4634f, 118.6825f, 159.6572f, - 49.5364f, 83.6166f, 77.2799f, 108.1312f, -12.0070f, 47.7104f, 26.4309f, 102.8334f, 73.0529f, 178.2168f, 94.3071f, 216.4359f, - 81.9253f, 137.8156f, 107.7278f, 149.2885f, 16.3219f, 179.7427f, 73.9152f, 200.7352f, 91.8087f, 17.5434f, 137.1745f, 29.8480f, - 96.6991f, 168.8745f, 129.6096f, 171.3390f, 131.5065f, 99.5547f, 149.2944f, 155.2749f, 102.6283f, 10.6622f, 156.5511f, 38.1065f, - 123.0512f, 108.0793f, 137.9220f, 127.2239f, 53.1452f, 119.0642f, 73.3404f, 155.3743f, 130.1690f, 1.7448f, 184.8039f, 3.1763f, - 93.7074f, 82.1619f, 125.9504f, 99.5652f, 63.8853f, 143.8404f, 108.6820f, 186.3194f, 107.2755f, 39.8756f, 143.1295f, 78.2680f, - 52.3550f, 62.2463f, 91.9079f, 121.1729f, 93.2160f, 69.6623f, 111.8797f, 107.2634f, 139.7207f, 45.7991f, 154.9616f, 74.9719f, - 167.2671f, 160.7261f, 187.2941f, 206.6506f, 179.1259f, 129.1106f, 189.2970f, 183.4070f, 74.4343f, 0.3572f, 127.0189f, 43.8782f, - 95.1992f, 170.4922f, 112.9108f, 228.3217f, 142.9101f, 152.2709f, 177.0380f, 
199.4092f, 39.0269f, 30.7110f, 86.7534f, 82.8523f, - 143.8537f, 163.5132f, 191.0993f, 171.2454f, 85.3959f, -0.8223f, 112.2607f, 43.3901f, 8.6218f, 186.3383f, 37.7209f, 213.3036f, - -15.4319f, 116.3204f, 44.2555f, 149.9535f, 147.9980f, 110.2290f, 188.7993f, 149.8210f, -13.4183f, -11.0214f, 35.6454f, 47.1977f, - 28.9969f, 149.8616f, 83.2476f, 208.9517f, 43.0921f, -3.2028f, 90.5599f, 14.8026f, 28.6361f, 26.0199f, 40.5617f, 70.3113f, - 45.6946f, 5.9799f, 79.8627f, 51.2289f, 145.0326f, 144.6320f, 152.0444f, 166.0751f, -16.8246f, 35.4867f, 22.6978f, 43.7950f, - 136.7519f, 180.4197f, 194.1175f, 183.8356f, 155.6840f, 107.8222f, 186.9352f, 154.6854f, 61.1796f, -7.7136f, 87.7250f, 22.1787f, - 29.1652f, -28.4875f, 32.2799f, 30.6594f, 91.3547f, -3.8851f, 148.9814f, 24.5483f, 20.3959f, 91.8365f, 27.4731f, 150.5336f, - 71.2720f, 147.6549f, 74.6957f, 172.9379f, 183.9269f, 23.7969f, 199.4448f, 71.6242f, 196.6597f, 166.8796f, 201.5260f, 172.8839f, - 140.4950f, -5.4397f, 168.3470f, 28.3325f, 46.4677f, 136.0320f, 77.9169f, 184.3535f, 127.8122f, 157.7804f, 147.2538f, 213.3378f, - 139.0779f, 129.6555f, 143.0846f, 179.1879f, 73.7761f, 138.0335f, 81.3605f, 141.2148f, 116.3348f, 156.1013f, 140.0206f, 179.0908f, - -0.1401f, 6.0937f, 4.4311f, 9.9669f, 20.7149f, 36.6326f, 62.9081f, 44.0802f, 98.4106f, 4.5632f, 111.6248f, 45.4062f, - 23.3391f, 79.3651f, 42.1614f, 122.4473f, 21.0547f, 125.7129f, 45.3081f, 172.3624f, 154.4709f, 99.9714f, 180.0508f, 152.0333f, - 197.2776f, 147.9130f, 198.3756f, 192.5394f, 107.3878f, 6.9169f, 115.0000f, 55.1683f, 141.8624f, 144.9798f, 193.7655f, 148.8687f, - 197.5280f, 31.1895f, 198.6007f, 46.0271f, 12.8282f, 35.3058f, 43.8101f, 72.9977f, 74.7088f, 116.1662f, 104.5894f, 167.7956f, - 68.1883f, 195.4082f, 88.8408f, 196.6737f, 2.7857f, 106.6272f, 29.2340f, 137.9903f, 127.5389f, -9.5799f, 174.5932f, 31.3800f, - 61.4403f, 121.8884f, 112.0713f, 124.6352f, 15.4868f, 35.9096f, 55.8899f, 68.2298f, 35.5922f, 56.6701f, 44.2246f, 72.3261f, - 163.1796f, 40.7751f, 180.4136f, 
56.2181f, 177.9262f, 90.7157f, 187.1069f, 101.2297f, 33.5656f, 108.4211f, 51.2933f, 164.8822f, - 73.5555f, 18.9549f, 114.3649f, 72.3462f, 119.3443f, 42.7151f, 174.0536f, 89.5792f, 169.1987f, 170.3059f, 182.1476f, 201.8479f, - 59.3192f, -5.2591f, 92.3019f, 24.6868f, 82.2129f, 76.0264f, 124.5949f, 108.2814f, 119.7321f, 125.9828f, 176.9545f, 158.6404f, - 127.7304f, 16.7712f, 164.7240f, 43.4104f, 148.5664f, 5.0880f, 164.6177f, 13.8616f, 95.0352f, 23.4340f, 132.9384f, 31.8482f, - 10.9685f, 155.1733f, 30.8775f, 212.3560f, 151.4989f, -12.8680f, 210.0904f, 16.5719f, 160.8241f, 9.0448f, 185.4050f, 66.2840f, - 138.8994f, 0.9312f, 180.3396f, 11.5822f, 18.7873f, 5.2706f, 21.1577f, 38.9812f, 28.5777f, 117.4022f, 53.1813f, 130.6575f, - 122.4044f, 40.3588f, 175.0358f, 56.2967f, -13.8737f, 112.4558f, 23.1297f, 115.2290f, 182.2486f, 114.0300f, 209.4412f, 122.0482f, - 47.3188f, 142.3400f, 103.5391f, 197.4341f, 118.1700f, -9.0369f, 169.5550f, 10.9335f, 167.5089f, 152.2341f, 187.5196f, 189.1137f, - 62.3618f, 109.6059f, 95.4902f, 138.0417f, 48.8767f, 20.2354f, 78.7763f, 44.8620f, 102.5983f, 138.3968f, 140.8982f, 170.7781f, - 105.8416f, 165.0748f, 126.5542f, 177.1219f, 74.1239f, 21.1889f, 89.5320f, 80.5165f, 92.9311f, 159.1187f, 147.7788f, 208.3988f, - 159.3220f, 68.5139f, 214.8306f, 113.2691f, 68.1500f, 106.3565f, 118.9061f, 135.0133f, 9.9914f, 191.9200f, 68.7055f, 201.9398f, - 52.9639f, 44.6476f, 97.9184f, 99.9669f, 55.7637f, 152.0609f, 101.8791f, 173.2028f, 3.2253f, 61.7017f, 49.2181f, 65.6580f, - 17.8964f, 149.2418f, 47.2522f, 170.4436f, 122.9471f, 96.2103f, 150.8778f, 144.0833f, 60.3089f, 24.4012f, 75.4822f, 62.1410f, - 171.4575f, 60.1555f, 210.5018f, 105.4550f, 39.6844f, 39.6149f, 57.7543f, 87.4394f, 11.6796f, 8.8690f, 27.8902f, 22.3743f, - 132.9151f, -21.7847f, 168.4868f, 33.7186f, 163.6127f, 55.8750f, 188.8017f, 82.7164f, 48.6664f, -15.5441f, 62.5789f, 23.1577f, - 15.8440f, 32.5294f, 64.9913f, 33.6657f, 11.2664f, 115.2323f, 63.0400f, 174.8410f, 98.9553f, 132.8318f, 109.8496f, 
150.4047f, - 92.9619f, 145.3852f, 94.4048f, 150.0469f, 41.4721f, 49.4119f, 62.3038f, 77.4494f, -14.9919f, 173.6975f, 33.0612f, 182.3103f, - 71.0426f, 113.7725f, 121.5539f, 123.7598f, 187.2858f, 6.0529f, 196.4472f, 44.3576f, 107.1609f, 16.6524f, 153.8468f, 40.8351f, - 95.1880f, 110.9244f, 103.0146f, 166.3137f, 10.1316f, 24.6737f, 34.1453f, 44.5039f, 20.5283f, 79.5362f, 80.4462f, 123.3809f, - 52.7734f, 184.2525f, 65.1362f, 212.4573f, 147.9188f, -19.1670f, 158.0026f, 20.7701f, 162.3696f, -14.8751f, 188.3148f, 21.5070f, - 161.5482f, 184.1698f, 199.1086f, 213.0640f, 168.8931f, 88.4010f, 224.9343f, 145.4546f, 167.0391f, 14.7719f, 225.9076f, 35.9920f, - 188.0454f, 173.7320f, 193.1542f, 185.1889f, 9.7935f, 155.5723f, 18.9354f, 196.5798f, 3.7319f, 81.7829f, 51.3855f, 132.6973f, - 52.4097f, 122.6709f, 69.3770f, 126.0459f, 83.9766f, 40.8733f, 137.1827f, 68.4016f, -0.6763f, -16.7244f, 39.4674f, 36.9323f, - 165.3600f, 96.2998f, 172.9588f, 141.5273f, 98.2916f, 29.1927f, 148.4108f, 88.7094f, 102.7704f, 116.5475f, 114.1754f, 148.9009f, - 20.0692f, 147.2792f, 46.0554f, 187.2189f, 33.8616f, -5.7911f, 67.4406f, 13.0553f, 16.7898f, 90.6905f, 47.3350f, 147.5951f, - 149.6448f, 34.9492f, 191.1284f, 57.5630f, 97.0913f, 152.4916f, 136.5998f, 197.0638f, 117.2606f, 38.3403f, 176.7911f, 63.1255f, - 29.2236f, 105.0804f, 89.1895f, 139.2277f, 58.5150f, 88.9746f, 89.9861f, 132.4418f, 77.6626f, 63.7197f, 84.2794f, 94.7469f, - 130.0316f, 108.2651f, 173.9744f, 162.7832f, 125.1590f, 132.2845f, 183.7822f, 158.0233f, 31.4721f, 93.7989f, 51.2533f, 132.9762f, - 174.2021f, 141.0848f, 202.4134f, 162.2841f, 11.1001f, 184.1428f, 37.1620f, 209.2240f, 177.2076f, 70.3730f, 181.2413f, 97.3360f, - -0.2527f, 98.7053f, 40.4109f, 107.1279f, 41.9845f, -0.7119f, 63.8314f, 5.6998f, 145.5655f, 139.0148f, 193.0259f, 179.3967f, - 10.8509f, 84.2082f, 60.9460f, 123.8838f, 57.9873f, 61.5364f, 107.4399f, 101.6481f, 77.1802f, 17.7313f, 102.7635f, 19.8975f, - 39.0662f, 167.7982f, 59.0374f, 188.0644f, 119.4588f, 72.6661f, 
164.6393f, 85.3368f, 146.1259f, 113.0609f, 194.4079f, 159.9718f, - 159.9229f, 3.9862f, 189.9071f, 55.7634f, 41.0200f, 184.5329f, 94.7088f, 200.0870f}; + -3.8699f, 108.8880f, 20.8101f, 137.5783f, 149.9079f, 29.3134f, 203.7504f, 39.2031f, + 121.6031f, 107.1528f, 162.2282f, 118.8275f, 27.1146f, 87.2265f, 42.1365f, 141.7457f, + -7.3128f, 91.3799f, 44.0012f, 95.0142f, 25.9397f, 97.1572f, 47.4736f, 111.8955f, + 170.3318f, 143.6689f, 221.6791f, 161.9004f, 82.3933f, 144.8881f, 101.0310f, 174.8098f, + 138.9017f, 80.6305f, 174.7306f, 116.2308f, 115.0719f, 104.8666f, 139.4914f, 134.9707f, + 105.8753f, 183.2658f, 123.0900f, 189.2287f, 2.3726f, 16.2585f, 55.6795f, 31.6349f, + 183.1709f, -1.9651f, 195.2389f, 48.8066f, 57.2666f, -1.7671f, 63.2705f, 36.8507f, + 105.0166f, 111.9228f, 126.1903f, 151.2225f, 118.2848f, 63.4507f, 161.6255f, 103.9927f, + 105.5274f, 131.8586f, 154.1659f, 177.8699f, 158.1560f, 132.0321f, 218.0818f, 136.4605f, + 20.4451f, 55.4126f, 38.9305f, 78.0425f, 89.1363f, 163.2572f, 114.2048f, 196.0894f, + 76.2707f, 142.0220f, 85.3431f, 162.9909f, 77.3750f, 28.6949f, 112.2925f, 79.5191f, + -6.0851f, 58.1025f, 53.7721f, 87.5743f, 5.6429f, 39.7135f, 47.9949f, 86.0625f, + 37.5563f, 5.8879f, 73.6739f, 57.1568f, 48.8660f, 14.1653f, 73.0158f, 44.9480f, + 58.0793f, 159.8937f, 113.0820f, 214.5573f, 107.0385f, 69.7607f, 137.3566f, 105.4010f, + 122.4620f, 51.0809f, 131.3896f, 102.2471f, 71.0835f, 135.3897f, 93.6408f, 156.4846f, + 79.2752f, 95.3835f, 84.2380f, 125.8137f, 37.0673f, 171.0514f, 49.9841f, 203.4046f, + 116.6400f, 152.4634f, 118.6825f, 159.6572f, 49.5364f, 83.6166f, 77.2799f, 108.1312f, + -12.0070f, 47.7104f, 26.4309f, 102.8334f, 73.0529f, 178.2168f, 94.3071f, 216.4359f, + 81.9253f, 137.8156f, 107.7278f, 149.2885f, 16.3219f, 179.7427f, 73.9152f, 200.7352f, + 91.8087f, 17.5434f, 137.1745f, 29.8480f, 96.6991f, 168.8745f, 129.6096f, 171.3390f, + 131.5065f, 99.5547f, 149.2944f, 155.2749f, 102.6283f, 10.6622f, 156.5511f, 38.1065f, + 123.0512f, 108.0793f, 137.9220f, 
127.2239f, 53.1452f, 119.0642f, 73.3404f, 155.3743f, + 130.1690f, 1.7448f, 184.8039f, 3.1763f, 93.7074f, 82.1619f, 125.9504f, 99.5652f, + 63.8853f, 143.8404f, 108.6820f, 186.3194f, 107.2755f, 39.8756f, 143.1295f, 78.2680f, + 52.3550f, 62.2463f, 91.9079f, 121.1729f, 93.2160f, 69.6623f, 111.8797f, 107.2634f, + 139.7207f, 45.7991f, 154.9616f, 74.9719f, 167.2671f, 160.7261f, 187.2941f, 206.6506f, + 179.1259f, 129.1106f, 189.2970f, 183.4070f, 74.4343f, 0.3572f, 127.0189f, 43.8782f, + 95.1992f, 170.4922f, 112.9108f, 228.3217f, 142.9101f, 152.2709f, 177.0380f, 199.4092f, + 39.0269f, 30.7110f, 86.7534f, 82.8523f, 143.8537f, 163.5132f, 191.0993f, 171.2454f, + 85.3959f, -0.8223f, 112.2607f, 43.3901f, 8.6218f, 186.3383f, 37.7209f, 213.3036f, + -15.4319f, 116.3204f, 44.2555f, 149.9535f, 147.9980f, 110.2290f, 188.7993f, 149.8210f, + -13.4183f, -11.0214f, 35.6454f, 47.1977f, 28.9969f, 149.8616f, 83.2476f, 208.9517f, + 43.0921f, -3.2028f, 90.5599f, 14.8026f, 28.6361f, 26.0199f, 40.5617f, 70.3113f, + 45.6946f, 5.9799f, 79.8627f, 51.2289f, 145.0326f, 144.6320f, 152.0444f, 166.0751f, + -16.8246f, 35.4867f, 22.6978f, 43.7950f, 136.7519f, 180.4197f, 194.1175f, 183.8356f, + 155.6840f, 107.8222f, 186.9352f, 154.6854f, 61.1796f, -7.7136f, 87.7250f, 22.1787f, + 29.1652f, -28.4875f, 32.2799f, 30.6594f, 91.3547f, -3.8851f, 148.9814f, 24.5483f, + 20.3959f, 91.8365f, 27.4731f, 150.5336f, 71.2720f, 147.6549f, 74.6957f, 172.9379f, + 183.9269f, 23.7969f, 199.4448f, 71.6242f, 196.6597f, 166.8796f, 201.5260f, 172.8839f, + 140.4950f, -5.4397f, 168.3470f, 28.3325f, 46.4677f, 136.0320f, 77.9169f, 184.3535f, + 127.8122f, 157.7804f, 147.2538f, 213.3378f, 139.0779f, 129.6555f, 143.0846f, 179.1879f, + 73.7761f, 138.0335f, 81.3605f, 141.2148f, 116.3348f, 156.1013f, 140.0206f, 179.0908f, + -0.1401f, 6.0937f, 4.4311f, 9.9669f, 20.7149f, 36.6326f, 62.9081f, 44.0802f, + 98.4106f, 4.5632f, 111.6248f, 45.4062f, 23.3391f, 79.3651f, 42.1614f, 122.4473f, + 21.0547f, 125.7129f, 45.3081f, 172.3624f, 154.4709f, 
99.9714f, 180.0508f, 152.0333f, + 197.2776f, 147.9130f, 198.3756f, 192.5394f, 107.3878f, 6.9169f, 115.0000f, 55.1683f, + 141.8624f, 144.9798f, 193.7655f, 148.8687f, 197.5280f, 31.1895f, 198.6007f, 46.0271f, + 12.8282f, 35.3058f, 43.8101f, 72.9977f, 74.7088f, 116.1662f, 104.5894f, 167.7956f, + 68.1883f, 195.4082f, 88.8408f, 196.6737f, 2.7857f, 106.6272f, 29.2340f, 137.9903f, + 127.5389f, -9.5799f, 174.5932f, 31.3800f, 61.4403f, 121.8884f, 112.0713f, 124.6352f, + 15.4868f, 35.9096f, 55.8899f, 68.2298f, 35.5922f, 56.6701f, 44.2246f, 72.3261f, + 163.1796f, 40.7751f, 180.4136f, 56.2181f, 177.9262f, 90.7157f, 187.1069f, 101.2297f, + 33.5656f, 108.4211f, 51.2933f, 164.8822f, 73.5555f, 18.9549f, 114.3649f, 72.3462f, + 119.3443f, 42.7151f, 174.0536f, 89.5792f, 169.1987f, 170.3059f, 182.1476f, 201.8479f, + 59.3192f, -5.2591f, 92.3019f, 24.6868f, 82.2129f, 76.0264f, 124.5949f, 108.2814f, + 119.7321f, 125.9828f, 176.9545f, 158.6404f, 127.7304f, 16.7712f, 164.7240f, 43.4104f, + 148.5664f, 5.0880f, 164.6177f, 13.8616f, 95.0352f, 23.4340f, 132.9384f, 31.8482f, + 10.9685f, 155.1733f, 30.8775f, 212.3560f, 151.4989f, -12.8680f, 210.0904f, 16.5719f, + 160.8241f, 9.0448f, 185.4050f, 66.2840f, 138.8994f, 0.9312f, 180.3396f, 11.5822f, + 18.7873f, 5.2706f, 21.1577f, 38.9812f, 28.5777f, 117.4022f, 53.1813f, 130.6575f, + 122.4044f, 40.3588f, 175.0358f, 56.2967f, -13.8737f, 112.4558f, 23.1297f, 115.2290f, + 182.2486f, 114.0300f, 209.4412f, 122.0482f, 47.3188f, 142.3400f, 103.5391f, 197.4341f, + 118.1700f, -9.0369f, 169.5550f, 10.9335f, 167.5089f, 152.2341f, 187.5196f, 189.1137f, + 62.3618f, 109.6059f, 95.4902f, 138.0417f, 48.8767f, 20.2354f, 78.7763f, 44.8620f, + 102.5983f, 138.3968f, 140.8982f, 170.7781f, 105.8416f, 165.0748f, 126.5542f, 177.1219f, + 74.1239f, 21.1889f, 89.5320f, 80.5165f, 92.9311f, 159.1187f, 147.7788f, 208.3988f, + 159.3220f, 68.5139f, 214.8306f, 113.2691f, 68.1500f, 106.3565f, 118.9061f, 135.0133f, + 9.9914f, 191.9200f, 68.7055f, 201.9398f, 52.9639f, 44.6476f, 97.9184f, 
99.9669f, + 55.7637f, 152.0609f, 101.8791f, 173.2028f, 3.2253f, 61.7017f, 49.2181f, 65.6580f, + 17.8964f, 149.2418f, 47.2522f, 170.4436f, 122.9471f, 96.2103f, 150.8778f, 144.0833f, + 60.3089f, 24.4012f, 75.4822f, 62.1410f, 171.4575f, 60.1555f, 210.5018f, 105.4550f, + 39.6844f, 39.6149f, 57.7543f, 87.4394f, 11.6796f, 8.8690f, 27.8902f, 22.3743f, + 132.9151f, -21.7847f, 168.4868f, 33.7186f, 163.6127f, 55.8750f, 188.8017f, 82.7164f, + 48.6664f, -15.5441f, 62.5789f, 23.1577f, 15.8440f, 32.5294f, 64.9913f, 33.6657f, + 11.2664f, 115.2323f, 63.0400f, 174.8410f, 98.9553f, 132.8318f, 109.8496f, 150.4047f, + 92.9619f, 145.3852f, 94.4048f, 150.0469f, 41.4721f, 49.4119f, 62.3038f, 77.4494f, + -14.9919f, 173.6975f, 33.0612f, 182.3103f, 71.0426f, 113.7725f, 121.5539f, 123.7598f, + 187.2858f, 6.0529f, 196.4472f, 44.3576f, 107.1609f, 16.6524f, 153.8468f, 40.8351f, + 95.1880f, 110.9244f, 103.0146f, 166.3137f, 10.1316f, 24.6737f, 34.1453f, 44.5039f, + 20.5283f, 79.5362f, 80.4462f, 123.3809f, 52.7734f, 184.2525f, 65.1362f, 212.4573f, + 147.9188f, -19.1670f, 158.0026f, 20.7701f, 162.3696f, -14.8751f, 188.3148f, 21.5070f, + 161.5482f, 184.1698f, 199.1086f, 213.0640f, 168.8931f, 88.4010f, 224.9343f, 145.4546f, + 167.0391f, 14.7719f, 225.9076f, 35.9920f, 188.0454f, 173.7320f, 193.1542f, 185.1889f, + 9.7935f, 155.5723f, 18.9354f, 196.5798f, 3.7319f, 81.7829f, 51.3855f, 132.6973f, + 52.4097f, 122.6709f, 69.3770f, 126.0459f, 83.9766f, 40.8733f, 137.1827f, 68.4016f, + -0.6763f, -16.7244f, 39.4674f, 36.9323f, 165.3600f, 96.2998f, 172.9588f, 141.5273f, + 98.2916f, 29.1927f, 148.4108f, 88.7094f, 102.7704f, 116.5475f, 114.1754f, 148.9009f, + 20.0692f, 147.2792f, 46.0554f, 187.2189f, 33.8616f, -5.7911f, 67.4406f, 13.0553f, + 16.7898f, 90.6905f, 47.3350f, 147.5951f, 149.6448f, 34.9492f, 191.1284f, 57.5630f, + 97.0913f, 152.4916f, 136.5998f, 197.0638f, 117.2606f, 38.3403f, 176.7911f, 63.1255f, + 29.2236f, 105.0804f, 89.1895f, 139.2277f, 58.5150f, 88.9746f, 89.9861f, 132.4418f, + 77.6626f, 63.7197f, 
84.2794f, 94.7469f, 130.0316f, 108.2651f, 173.9744f, 162.7832f, + 125.1590f, 132.2845f, 183.7822f, 158.0233f, 31.4721f, 93.7989f, 51.2533f, 132.9762f, + 174.2021f, 141.0848f, 202.4134f, 162.2841f, 11.1001f, 184.1428f, 37.1620f, 209.2240f, + 177.2076f, 70.3730f, 181.2413f, 97.3360f, -0.2527f, 98.7053f, 40.4109f, 107.1279f, + 41.9845f, -0.7119f, 63.8314f, 5.6998f, 145.5655f, 139.0148f, 193.0259f, 179.3967f, + 10.8509f, 84.2082f, 60.9460f, 123.8838f, 57.9873f, 61.5364f, 107.4399f, 101.6481f, + 77.1802f, 17.7313f, 102.7635f, 19.8975f, 39.0662f, 167.7982f, 59.0374f, 188.0644f, + 119.4588f, 72.6661f, 164.6393f, 85.3368f, 146.1259f, 113.0609f, 194.4079f, 159.9718f, + 159.9229f, 3.9862f, 189.9071f, 55.7634f, 41.0200f, 184.5329f, 94.7088f, 200.0870f}; std::vector scores_vec = { - 0.1439f, 0.8791f, 0.0961f, 0.1535f, 0.5338f, 0.0675f, 0.0528f, 0.0005f, 0.4363f, 0.7746f, 0.0348f, 0.6523f, - 0.8231f, 0.1680f, 0.1469f, 0.8608f, 0.8231f, 0.5389f, 0.8192f, 0.0928f, 0.3945f, 0.7378f, 0.2575f, 0.7523f, - 0.5042f, 0.7503f, 0.4647f, 0.3679f, 0.2192f, 0.2084f, 0.7515f, 0.1189f, 0.0860f, 0.1763f, 0.1753f, 0.8231f, - 0.3985f, 0.9904f, 0.1372f, 0.6535f, 0.4487f, 0.3929f, 0.8751f, 0.9756f, 0.8729f, 0.1923f, 0.2208f, 0.6561f, - 0.2891f, 0.7347f, 0.5664f, 0.5509f, 0.8285f, 0.7105f, 0.0266f, 0.0495f, 0.6016f, 0.4862f, 0.2602f, 0.4187f, - 0.7579f, 0.8266f, 0.5612f, 0.3854f, 0.2707f, 0.5219f, 0.3147f, 0.5641f, 0.6767f, 0.0661f, 0.0011f, 0.2123f, - 0.8945f, 0.6463f, 0.1720f, 0.8903f, 0.4700f, 0.4761f, 0.9355f, 0.0595f, 0.2152f, 0.5858f, 0.1955f, 0.6795f, - 0.2141f, 0.0992f, 0.2070f, 0.4227f, 0.1761f, 0.1347f, 0.8603f, 0.3204f, 0.3608f, 0.0553f, 0.3574f, 0.2648f, - 0.6105f, 0.2054f, 0.8884f, 0.9297f, 0.0998f, 0.1074f, 0.1153f, 0.6196f, 0.1220f, 0.8524f, 0.7543f, 0.8198f, - 0.5261f, 0.9967f, 0.0442f, 0.4013f, 0.3239f, 0.9486f, 0.5769f, 0.8062f, 0.1703f, 0.9786f, 0.4986f, 0.4937f, - 0.9709f, 0.3807f, 0.3975f, 0.5848f, 0.1281f, 0.3211f, 0.1932f, 0.1033f, 0.8661f, 0.5893f, 0.3587f, 0.4087f, - 
0.4315f, 0.6331f, 0.9268f, 0.9328f, 0.3915f, 0.3293f, 0.4510f, 0.5679f, 0.4618f, 0.6588f, 0.5544f, 0.3207f, - 0.3457f, 0.3786f, 0.0946f, 0.1661f, 0.7231f, 0.3891f, 0.2145f, 0.5627f, 0.7555f, 0.2574f, 0.8268f, 0.9275f, - 0.5974f, 0.6689f, 0.0526f, 0.9455f, 0.3925f, 0.9239f, 0.5790f, 0.0046f, 0.0385f, 0.6804f, 0.5627f, 0.0265f, - 0.7435f, 0.8521f, 0.4964f, 0.4658f, 0.0055f, 0.7866f, 0.3307f, 0.8788f, 0.3731f, 0.5651f, 0.2703f, 0.1606f, - 0.7749f, 0.4966f, 0.5365f, 0.9654f, 0.9636f, 0.8556f, 0.1876f, 0.5943f, 0.8781f, 0.3745f, 0.1011f, 0.8110f, - 0.4818f, 0.5644f, 0.9821f, 0.6072f, 0.4250f, 0.3700f, 0.4176f, 0.1184f}; + 0.1439f, 0.8791f, 0.0961f, 0.1535f, 0.5338f, 0.0675f, 0.0528f, 0.0005f, 0.4363f, 0.7746f, + 0.0348f, 0.6523f, 0.8231f, 0.1680f, 0.1469f, 0.8608f, 0.8231f, 0.5389f, 0.8192f, 0.0928f, + 0.3945f, 0.7378f, 0.2575f, 0.7523f, 0.5042f, 0.7503f, 0.4647f, 0.3679f, 0.2192f, 0.2084f, + 0.7515f, 0.1189f, 0.0860f, 0.1763f, 0.1753f, 0.8231f, 0.3985f, 0.9904f, 0.1372f, 0.6535f, + 0.4487f, 0.3929f, 0.8751f, 0.9756f, 0.8729f, 0.1923f, 0.2208f, 0.6561f, 0.2891f, 0.7347f, + 0.5664f, 0.5509f, 0.8285f, 0.7105f, 0.0266f, 0.0495f, 0.6016f, 0.4862f, 0.2602f, 0.4187f, + 0.7579f, 0.8266f, 0.5612f, 0.3854f, 0.2707f, 0.5219f, 0.3147f, 0.5641f, 0.6767f, 0.0661f, + 0.0011f, 0.2123f, 0.8945f, 0.6463f, 0.1720f, 0.8903f, 0.4700f, 0.4761f, 0.9355f, 0.0595f, + 0.2152f, 0.5858f, 0.1955f, 0.6795f, 0.2141f, 0.0992f, 0.2070f, 0.4227f, 0.1761f, 0.1347f, + 0.8603f, 0.3204f, 0.3608f, 0.0553f, 0.3574f, 0.2648f, 0.6105f, 0.2054f, 0.8884f, 0.9297f, + 0.0998f, 0.1074f, 0.1153f, 0.6196f, 0.1220f, 0.8524f, 0.7543f, 0.8198f, 0.5261f, 0.9967f, + 0.0442f, 0.4013f, 0.3239f, 0.9486f, 0.5769f, 0.8062f, 0.1703f, 0.9786f, 0.4986f, 0.4937f, + 0.9709f, 0.3807f, 0.3975f, 0.5848f, 0.1281f, 0.3211f, 0.1932f, 0.1033f, 0.8661f, 0.5893f, + 0.3587f, 0.4087f, 0.4315f, 0.6331f, 0.9268f, 0.9328f, 0.3915f, 0.3293f, 0.4510f, 0.5679f, + 0.4618f, 0.6588f, 0.5544f, 0.3207f, 0.3457f, 0.3786f, 0.0946f, 0.1661f, 0.7231f, 
0.3891f, + 0.2145f, 0.5627f, 0.7555f, 0.2574f, 0.8268f, 0.9275f, 0.5974f, 0.6689f, 0.0526f, 0.9455f, + 0.3925f, 0.9239f, 0.5790f, 0.0046f, 0.0385f, 0.6804f, 0.5627f, 0.0265f, 0.7435f, 0.8521f, + 0.4964f, 0.4658f, 0.0055f, 0.7866f, 0.3307f, 0.8788f, 0.3731f, 0.5651f, 0.2703f, 0.1606f, + 0.7749f, 0.4966f, 0.5365f, 0.9654f, 0.9636f, 0.8556f, 0.1876f, 0.5943f, 0.8781f, 0.3745f, + 0.1011f, 0.8110f, 0.4818f, 0.5644f, 0.9821f, 0.6072f, 0.4250f, 0.3700f, 0.4176f, 0.1184f}; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -583,7 +629,11 @@ TEST_CASE(nms_100boxes_2batch_test) auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); - std::vector gold = {0, 0, 37, 0, 0, 43, 0, 0, 78, 0, 0, 99, 0, 0, 72, 0, 0, 75, 0, 0, 98, 0, 0, 1, 0, 0, 42, 0, 0, 44, 0, 0, 15, 0, 0, 90, 0, 0, 52, 0, 0, 61, 0, 0, 12, 1, 0, 9, 1, 0, 94, 1, 0, 17, 1, 0, 20, 1, 0, 83, 1, 0, 84, 1, 0, 13, 1, 0, 59, 1, 0, 35, 1, 0, 55, 1, 0, 34, 1, 0, 61, 1, 0, 75, 1, 0, 88, 1, 0, 28}; + std::vector gold = {0, 0, 37, 0, 0, 43, 0, 0, 78, 0, 0, 99, 0, 0, 72, 0, 0, 75, + 0, 0, 98, 0, 0, 1, 0, 0, 42, 0, 0, 44, 0, 0, 15, 0, 0, 90, + 0, 0, 52, 0, 0, 61, 0, 0, 12, 1, 0, 9, 1, 0, 94, 1, 0, 17, + 1, 0, 20, 1, 0, 83, 1, 0, 84, 1, 0, 13, 1, 0, 59, 1, 0, 35, + 1, 0, 55, 1, 0, 34, 1, 0, 61, 1, 0, 75, 1, 0, 88, 1, 0, 28}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); EXPECT(num_selected == 30); } @@ -602,35 +652,39 @@ TEST_CASE(nms_30boxes_3class_test) auto iou_threshold = mm->add_literal(0.4500f); auto score_threshold = mm->add_literal(0.1500f); - auto nms = - mm->add_instruction(migraphx::make_op("nonmaxsuppression"), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); add_nms_return(mm, nms); std::vector boxes_vec = { - 
31.2680f, 53.5348f, 37.7043f, 73.6253f, 1.8071f, 55.2945f, 3.9368f, 78.7402f, 40.5016f, 12.5670f, 45.0345f, 32.9366f, - 78.2552f, 12.9548f, 80.7117f, 35.6526f, 73.9527f, 67.9870f, 79.4405f, 71.9065f, -3.8066f, -7.7339f, 10.2705f, 11.5692f, - 45.4706f, 34.8613f, 67.4569f, 48.4119f, 17.4632f, 30.3439f, 30.8192f, 43.8443f, 64.5403f, 44.3725f, 79.9380f, 66.0477f, - 0.7877f, 1.3956f, 6.4307f, 24.7471f, 65.1632f, 44.8608f, 84.5766f, 62.0721f, 59.3935f, 24.0849f, 74.6026f, 36.1925f, - -1.0372f, 43.7485f, 19.8379f, 55.2458f, -6.6257f, -1.7353f, 16.1976f, 8.1505f, 62.2758f, 32.2798f, 71.2775f, 41.5966f, - 10.9190f, 36.7777f, 14.0023f, 46.7824f, 39.6937f, 15.6139f, 45.8900f, 18.6783f, 67.7244f, 9.7794f, 78.7948f, 12.5604f, - 34.0204f, 5.6094f, 56.7713f, 24.5464f, 26.9281f, 21.9014f, 36.6292f, 33.1611f, 26.2374f, -3.4581f, 44.9652f, 18.9477f, - -1.6661f, 68.2450f, 11.7649f, 83.3261f, 74.8979f, 31.4950f, 80.1025f, 33.3041f, 20.6639f, 62.4061f, 29.0408f, 67.0291f, - 7.1374f, 75.0864f, 23.1608f, 80.8203f, 14.6460f, -5.2621f, 31.1216f, 18.1798f, 71.6501f, 49.1185f, 82.6496f, 55.1487f, - 4.4135f, 63.2815f, 10.6723f, 76.1439f, 60.5823f, 39.4727f, 78.1862f, 62.0048f, 54.1855f, 22.5844f, 59.0696f, 46.0598f}; + 31.2680f, 53.5348f, 37.7043f, 73.6253f, 1.8071f, 55.2945f, 3.9368f, 78.7402f, 40.5016f, + 12.5670f, 45.0345f, 32.9366f, 78.2552f, 12.9548f, 80.7117f, 35.6526f, 73.9527f, 67.9870f, + 79.4405f, 71.9065f, -3.8066f, -7.7339f, 10.2705f, 11.5692f, 45.4706f, 34.8613f, 67.4569f, + 48.4119f, 17.4632f, 30.3439f, 30.8192f, 43.8443f, 64.5403f, 44.3725f, 79.9380f, 66.0477f, + 0.7877f, 1.3956f, 6.4307f, 24.7471f, 65.1632f, 44.8608f, 84.5766f, 62.0721f, 59.3935f, + 24.0849f, 74.6026f, 36.1925f, -1.0372f, 43.7485f, 19.8379f, 55.2458f, -6.6257f, -1.7353f, + 16.1976f, 8.1505f, 62.2758f, 32.2798f, 71.2775f, 41.5966f, 10.9190f, 36.7777f, 14.0023f, + 46.7824f, 39.6937f, 15.6139f, 45.8900f, 18.6783f, 67.7244f, 9.7794f, 78.7948f, 12.5604f, + 34.0204f, 5.6094f, 56.7713f, 24.5464f, 26.9281f, 21.9014f, 
36.6292f, 33.1611f, 26.2374f, + -3.4581f, 44.9652f, 18.9477f, -1.6661f, 68.2450f, 11.7649f, 83.3261f, 74.8979f, 31.4950f, + 80.1025f, 33.3041f, 20.6639f, 62.4061f, 29.0408f, 67.0291f, 7.1374f, 75.0864f, 23.1608f, + 80.8203f, 14.6460f, -5.2621f, 31.1216f, 18.1798f, 71.6501f, 49.1185f, 82.6496f, 55.1487f, + 4.4135f, 63.2815f, 10.6723f, 76.1439f, 60.5823f, 39.4727f, 78.1862f, 62.0048f, 54.1855f, + 22.5844f, 59.0696f, 46.0598f}; std::vector scores_vec = { - 0.9367f, 0.1879f, 0.1073f, 0.4976f, 0.5195f, 0.5082f, 0.4367f, 0.9948f, 0.4863f, 0.4779f, 0.4218f, 0.0668f, - 0.5930f, 0.2280f, 0.6376f, 0.0508f, 0.9814f, 0.4690f, 0.8968f, 0.4756f, 0.0603f, 0.8222f, 0.6482f, 0.7818f, - 0.4282f, 0.6379f, 0.8562f, 0.6311f, 0.3477f, 0.6625f, 0.6719f, 0.9606f, 0.3709f, 0.4251f, 0.8121f, 0.5058f, - 0.7366f, 0.4597f, 0.2155f, 0.7452f, 0.1312f, 0.1986f, 0.6268f, 0.7473f, 0.8947f, 0.2726f, 0.1107f, 0.9560f, - 0.1544f, 0.1977f, 0.2913f, 0.5294f, 0.8828f, 0.7605f, 0.7082f, 0.1752f, 0.3577f, 0.4784f, 0.1474f, 0.2734f, - 0.3083f, 0.1273f, 0.5502f, 0.7050f, 0.0699f, 0.4811f, 0.7822f, 0.7480f, 0.8151f, 0.4482f, 0.8206f, 0.2408f, - 0.3608f, 0.1764f, 0.4675f, 0.3921f, 0.2409f, 0.7518f, 0.3138f, 0.2728f, 0.1309f, 0.4388f, 0.3030f, 0.3693f, - 0.2360f, 0.7632f, 0.9300f, 0.4979f, 0.6430f, 0.8672f}; + 0.9367f, 0.1879f, 0.1073f, 0.4976f, 0.5195f, 0.5082f, 0.4367f, 0.9948f, 0.4863f, 0.4779f, + 0.4218f, 0.0668f, 0.5930f, 0.2280f, 0.6376f, 0.0508f, 0.9814f, 0.4690f, 0.8968f, 0.4756f, + 0.0603f, 0.8222f, 0.6482f, 0.7818f, 0.4282f, 0.6379f, 0.8562f, 0.6311f, 0.3477f, 0.6625f, + 0.6719f, 0.9606f, 0.3709f, 0.4251f, 0.8121f, 0.5058f, 0.7366f, 0.4597f, 0.2155f, 0.7452f, + 0.1312f, 0.1986f, 0.6268f, 0.7473f, 0.8947f, 0.2726f, 0.1107f, 0.9560f, 0.1544f, 0.1977f, + 0.2913f, 0.5294f, 0.8828f, 0.7605f, 0.7082f, 0.1752f, 0.3577f, 0.4784f, 0.1474f, 0.2734f, + 0.3083f, 0.1273f, 0.5502f, 0.7050f, 0.0699f, 0.4811f, 0.7822f, 0.7480f, 0.8151f, 0.4482f, + 0.8206f, 0.2408f, 0.3608f, 0.1764f, 0.4675f, 0.3921f, 0.2409f, 
0.7518f, 0.3138f, 0.2728f, + 0.1309f, 0.4388f, 0.3030f, 0.3693f, 0.2360f, 0.7632f, 0.9300f, 0.4979f, 0.6430f, 0.8672f}; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -638,7 +692,9 @@ TEST_CASE(nms_30boxes_3class_test) auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); - std::vector gold = {0, 0, 7, 0, 0, 16, 0, 0, 0, 0, 0, 18, 0, 0, 26, 0, 1, 1, 0, 1, 17, 0, 1, 14, 0, 1, 22, 0, 1, 4, 0, 2, 26, 0, 2, 29, 0, 2, 10, 0, 2, 6, 0, 2, 25}; + std::vector gold = {0, 0, 7, 0, 0, 16, 0, 0, 0, 0, 0, 18, 0, 0, 26, + 0, 1, 1, 0, 1, 17, 0, 1, 14, 0, 1, 22, 0, 1, 4, + 0, 2, 26, 0, 2, 29, 0, 2, 10, 0, 2, 6, 0, 2, 25}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); EXPECT(num_selected == 15); } @@ -657,218 +713,296 @@ TEST_CASE(nms_200boxes_2batch_2class_test) auto iou_threshold = mm->add_literal(0.3000f); auto score_threshold = mm->add_literal(0.2500f); - auto nms = - mm->add_instruction(migraphx::make_op("nonmaxsuppression"), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); add_nms_return(mm, nms); std::vector boxes_vec = { - 132.1894f, 453.1169f, 199.9736f, 545.7127f, 64.3090f, 275.1729f, 104.8258f, 338.3436f, 76.1273f, 401.7875f, 135.6448f, 487.9920f, - 12.8305f, 442.3624f, 77.1708f, 466.2458f, -5.9609f, 340.1129f, 126.0715f, 451.3386f, 15.0119f, 224.3769f, 56.2927f, 236.5545f, - 427.8277f, -14.2917f, 561.9954f, 95.4457f, 4.7940f, -55.8461f, 69.2637f, 71.6517f, 41.3494f, 202.9014f, 91.1927f, 274.2992f, - 375.6902f, 208.6749f, 451.5645f, 285.6396f, 258.4982f, 179.9212f, 321.7420f, 227.4412f, 367.5344f, 211.3590f, 406.8828f, 356.8083f, - 277.5064f, 220.9636f, 353.4056f, 331.1991f, 429.2783f, 390.3169f, 452.8968f, 446.2962f, 292.5150f, 40.8054f, 345.9525f, 67.8517f, - 
218.4112f, 95.7302f, 303.7139f, 129.4475f, 325.0759f, 361.4403f, 387.6738f, 431.5647f, 161.8149f, 353.1971f, 285.5779f, 494.6398f, - 153.4061f, 442.2182f, 192.6577f, 552.6060f, 161.0782f, 419.9203f, 306.5742f, 452.9917f, 25.8953f, 380.4122f, 143.8188f, 509.4868f, - 325.7002f, 128.4980f, 470.8716f, 185.8499f, 67.4107f, 136.8775f, 193.2931f, 264.7841f, 65.6790f, 115.5359f, 87.8525f, 152.5492f, - 83.4548f, 256.5595f, 162.8974f, 349.7399f, 407.8717f, 399.8657f, 434.1985f, 538.9396f, 103.6427f, 152.6073f, 226.5586f, 192.0336f, - 299.0049f, 226.3779f, 387.0450f, 330.6239f, 408.0779f, 74.0950f, 448.3318f, 222.2046f, -30.8828f, 73.1804f, 108.6275f, 96.6196f, - 373.4308f, 90.5068f, 391.5936f, 104.6787f, 111.3250f, -21.7549f, 196.3405f, 79.7002f, 54.0937f, 448.8364f, 162.5287f, 500.4571f, - 339.5665f, 195.6321f, 349.3349f, 207.2475f, 409.8580f, 381.1502f, 499.9386f, 452.9707f, 86.2250f, 284.0088f, 208.7943f, 397.3206f, - 278.8861f, 74.2190f, 289.9477f, 117.7022f, 106.2550f, 62.2701f, 183.5792f, 113.1921f, 257.3803f, 342.4895f, 296.9053f, 469.4987f, - 261.0432f, 93.1105f, 360.8189f, 171.6012f, 295.8262f, 393.3591f, 314.5092f, 519.9261f, 241.4629f, 36.2717f, 382.0835f, 103.7837f, - 0.3826f, 267.3577f, 134.6972f, 410.3510f, 332.4151f, 358.2527f, 361.1253f, 456.2211f, 312.7919f, 108.4937f, 361.9585f, 126.7627f, - 297.0153f, 71.6643f, 385.5729f, 204.5431f, -16.9604f, 445.3092f, 91.0309f, 519.2097f, 189.9415f, 121.2467f, 256.8973f, 143.3509f, - 192.3739f, 203.1031f, 216.6613f, 226.8539f, 35.0965f, 164.5365f, 51.6150f, 267.9791f, 36.2014f, 122.4881f, 186.1665f, 130.5466f, - 186.0576f, 366.0443f, 254.9050f, 409.7468f, 305.9496f, 375.0105f, 436.9568f, 396.8388f, 82.0940f, 155.7987f, 154.9680f, 222.5193f, - 345.6593f, 386.1935f, 484.0906f, 448.9323f, 265.8611f, 67.1577f, 279.9372f, 145.9173f, 371.2164f, -19.1800f, 389.2053f, 23.4858f, - 166.5204f, 282.6964f, 306.0356f, 288.4709f, 178.5089f, 450.7671f, 320.6853f, 543.3107f, 285.9132f, -9.0198f, 333.8062f, 47.6641f, - 437.0255f, 
54.9746f, 490.9451f, 153.0235f, 211.6987f, 250.8616f, 280.1138f, 268.0530f, 232.8247f, 403.4440f, 295.8328f, 406.4968f, - 286.3401f, 25.5231f, 315.6569f, 63.5189f, 301.3286f, 163.1046f, 436.1865f, 232.1301f, 16.5538f, 343.6795f, 55.2966f, 403.3963f, - 204.8009f, 124.9041f, 310.8865f, 246.6391f, 235.2927f, 65.7693f, 246.2989f, 123.0671f, 457.4555f, 57.7300f, 464.2295f, 137.7658f, - 197.5504f, 160.3075f, 295.9562f, 249.7413f, 208.4036f, 237.5821f, 259.9170f, 241.8350f, 431.7683f, 392.0298f, 530.4317f, 469.7846f, - 217.7836f, 294.9363f, 232.7928f, 347.3161f, 19.1783f, 313.3156f, 161.7061f, 377.0863f, 52.1937f, 483.5222f, 164.7224f, 499.4650f, - -18.1881f, 147.1016f, 113.3757f, 264.7419f, -10.3830f, 130.9681f, 10.9511f, 272.3863f, 191.6208f, 459.5145f, 240.3248f, 463.8325f, - 356.6797f, 77.6355f, 412.5629f, 168.2401f, 326.2139f, 307.5013f, 407.2526f, 422.3140f, -6.5422f, 355.5684f, 38.6912f, 399.0047f, - 279.9745f, -10.2789f, 290.0085f, 108.0669f, 49.1601f, 186.5052f, 105.1230f, 281.7262f, 451.0742f, 30.5586f, 490.0021f, 170.0038f, - 54.4314f, 19.1028f, 112.9336f, 166.2725f, 298.1461f, 228.2593f, 328.4931f, 235.5688f, 143.1079f, 111.0670f, 183.1305f, 178.3627f, - 273.5727f, 356.7796f, 367.9886f, 439.2808f, 176.7118f, 442.3701f, 235.5468f, 465.2348f, 353.5905f, 375.8070f, 406.0526f, 426.9136f, - 75.0636f, 58.9357f, 155.6155f, 207.0952f, 394.8923f, 135.3580f, 510.8995f, 138.7764f, 221.3792f, 93.1523f, 278.8305f, 161.5760f, - 333.7764f, 4.2413f, 422.3168f, 130.7968f, 352.3830f, 447.2686f, 497.3472f, 496.5298f, 460.0268f, 164.7789f, 538.8018f, 237.2689f, - 43.6929f, 38.9803f, 180.2527f, 185.7092f, 83.8176f, 387.4572f, 203.0748f, 459.2138f, 120.3420f, 189.3440f, 130.0911f, 209.8513f, - 98.9678f, 13.2052f, 163.9035f, 21.9117f, 238.6976f, 10.0373f, 343.7471f, 151.9043f, 422.7512f, 299.3224f, 570.7713f, 339.9280f, - 460.4900f, 353.3999f, 529.7881f, 429.5054f, 255.9741f, 98.2099f, 270.7991f, 112.7245f, 277.1439f, 426.6355f, 361.8833f, 490.7601f, - 420.0563f, 355.7057f, 
439.9143f, 495.2914f, 409.9785f, 386.2606f, 522.9550f, 462.1201f, 63.6084f, 40.9810f, 140.2522f, 186.6801f, - 209.8752f, 5.4847f, 318.6665f, 45.0513f, 351.1511f, 395.6231f, 481.6860f, 471.8004f, 104.2444f, 88.3651f, 198.9577f, 217.4352f, - 173.7778f, 275.5634f, 266.0312f, 343.3530f, 436.0951f, 358.6616f, 549.5261f, 401.3052f, 429.2604f, -0.0863f, 555.7863f, 128.3795f, - 387.8089f, 360.8724f, 518.2979f, 419.9659f, 396.0101f, 429.2169f, 402.4382f, 509.2946f, 92.6291f, 290.9362f, 176.5014f, 437.4388f, - 143.8130f, 206.2184f, 177.0371f, 235.0044f, 209.0457f, 415.3847f, 338.2372f, 461.2934f, 231.5831f, 260.9141f, 329.1943f, 266.5435f, - 220.9448f, 342.6935f, 284.5580f, 402.0774f, 303.8214f, 394.8393f, 332.8489f, 425.6666f, 178.4043f, 323.5138f, 229.9188f, 425.8390f, - 321.6556f, 129.9190f, 427.5185f, 157.9359f, 151.0502f, 8.1484f, 182.4998f, 109.6955f, 157.8666f, 99.0403f, 172.8104f, 139.2982f, - -3.0452f, 224.4737f, 130.2711f, 278.4012f, 36.9224f, 226.1483f, 151.7898f, 279.1286f, 409.8757f, 237.4242f, 440.6452f, 345.2202f, - 200.8640f, 162.2960f, 245.4184f, 232.8059f, 41.0147f, 366.0289f, 186.8531f, 420.8625f, 326.4108f, 392.5565f, 432.9303f, 520.5973f, - 231.0067f, 80.2522f, 322.9745f, 166.4729f, -12.8403f, 351.8312f, 33.9963f, 384.6920f, 135.3959f, 271.4291f, 180.9655f, 406.5427f, - 85.0562f, 235.5178f, 91.9452f, 287.5727f, 273.1645f, 90.8612f, 382.7083f, 97.6691f, 133.7990f, 360.2684f, 141.2321f, 434.9638f, - 31.6115f, 470.5798f, 33.3353f, 490.0465f, -27.3799f, 342.6524f, 82.3149f, 379.1839f, 219.6726f, 402.7702f, 362.0547f, 515.0898f, - -45.9977f, 481.8516f, 67.7212f, 502.3336f, 388.7589f, 115.4080f, 460.0333f, 236.6427f, 40.9882f, 248.8122f, 114.4089f, 389.4114f, - 270.2910f, 191.2797f, 336.2753f, 282.6530f, 197.6581f, 439.8926f, 247.0300f, 546.7361f, 182.0580f, -6.7583f, 260.7935f, 100.5661f, - 3.2778f, 131.7233f, 68.5193f, 280.6516f, 356.3126f, 411.8249f, 446.4396f, 463.7141f, 379.1163f, 129.3928f, 513.9362f, 154.6585f, - -69.1199f, 354.7185f, 80.1365f, 
433.0744f, 82.9357f, 151.1645f, 95.6685f, 231.6187f, 422.7932f, 476.2348f, 481.1110f, 503.7437f, - 260.7842f, 395.5883f, 288.7094f, 487.9416f, 48.2868f, 149.1079f, 101.7528f, 152.2125f, 79.4785f, 315.4853f, 123.3120f, 454.7079f, - 316.4901f, 148.2175f, 343.4961f, 188.6391f, 304.9847f, 299.7342f, 419.8321f, 306.6287f, 262.2399f, 320.6758f, 337.1869f, 337.8050f, - 407.5904f, 396.3992f, 545.5580f, 433.1963f, 244.1037f, -8.6806f, 249.9599f, 33.1314f, 144.6461f, 107.1346f, 155.6258f, 113.0233f, - 208.0726f, 334.6470f, 269.1603f, 377.2708f, 173.3525f, 266.8875f, 186.3138f, 296.6358f, 92.1346f, 219.0953f, 132.2813f, 276.5098f, - -50.9776f, -1.5900f, 96.9408f, 56.8000f, 160.0388f, 148.3819f, 192.1737f, 199.8940f, 340.4449f, 407.6198f, 370.9644f, 457.4804f, - -34.0173f, 8.2614f, 52.4551f, 22.6314f, 181.9884f, 195.8403f, 257.1901f, 200.5959f, 278.2621f, 457.0166f, 365.7473f, 488.1317f, - 276.6353f, -31.4300f, 333.7688f, 82.3108f, 326.2304f, 300.5375f, 450.4180f, 449.1682f, 394.4356f, 59.1311f, 416.0841f, 198.4815f, - 323.4377f, 395.2401f, 388.2682f, 471.3687f, -0.4884f, 332.9131f, 103.2861f, 413.1549f, 172.3276f, 418.9163f, 302.6948f, 466.7889f, - 273.6699f, 49.8039f, 329.7361f, 166.1209f, 79.9860f, 208.1720f, 165.5801f, 323.1208f, 15.6250f, 326.2367f, 26.9268f, 453.0333f, - 98.6064f, 55.6348f, 124.9839f, 190.0650f, 221.7964f, 82.5141f, 233.0980f, 148.2322f, 152.2380f, -44.0412f, 261.6923f, 71.2233f, - 66.3730f, 418.6809f, 110.2940f, 539.8344f, 357.7888f, 331.5282f, 466.6268f, 378.4887f, 457.3967f, 248.0516f, 468.2900f, 387.5087f, - 35.9143f, 364.4689f, 165.4340f, 379.5258f, 402.0395f, 191.2334f, 527.5334f, 340.3795f, 1.8053f, 180.1951f, 16.0557f, 295.9387f, - 460.2114f, 217.3174f, 464.7511f, 232.2148f, 471.2709f, 270.8305f, 480.6579f, 369.6087f, -58.0695f, 97.7211f, 70.1214f, 103.8139f, - 363.5242f, 386.1504f, 399.4951f, 501.9083f, 443.7544f, 345.8341f, 526.4471f, 465.9183f, 420.6959f, 129.4022f, 485.2063f, 220.1614f, - 425.5884f, 224.9686f, 545.1217f, 353.6407f, 238.2388f, 
62.7213f, 312.0847f, 78.3060f, 1.2788f, 465.1168f, 76.8773f, 507.2295f, - 350.7072f, 420.0901f, 499.0819f, 482.8026f, 295.2295f, 457.2856f, 318.5988f, 464.6119f, 248.9387f, 366.2193f, 368.7308f, 464.4846f, - 266.4057f, -43.0988f, 411.9049f, 94.8485f, 365.3591f, 230.8355f, 381.3726f, 246.8133f, 213.6699f, 419.1429f, 302.9046f, 467.1919f, - 282.3146f, 326.7091f, 321.6300f, 338.5049f, 157.0835f, 271.7193f, 238.9818f, 413.4953f, -3.7474f, 97.9864f, 45.0004f, 165.3309f, - 28.3577f, 158.4742f, 71.5941f, 260.1006f, 284.2465f, 120.1271f, 370.7495f, 246.4540f, 483.6205f, 186.3921f, 511.9348f, 335.0511f, - -27.5488f, 218.5612f, 43.3521f, 243.6668f, 229.8062f, 103.3855f, 327.7773f, 223.5129f, 365.4548f, 86.1273f, 385.5540f, 219.3533f, - 343.5581f, 121.2852f, 483.2167f, 129.5677f, 234.4260f, 125.8439f, 310.7789f, 239.2034f, 248.4032f, 48.0437f, 371.5128f, 101.8978f, - 299.1465f, 387.2317f, 397.5784f, 484.8726f, 376.0880f, 262.2631f, 482.8782f, 339.8563f, 7.2930f, 47.0424f, 114.9965f, 86.7440f, - 397.3961f, 336.3557f, 528.7860f, 357.5037f, -33.2049f, 414.6207f, 59.2223f, 433.0458f, 396.8727f, 110.5703f, 439.3271f, 126.9654f, - 30.4567f, 27.2849f, 46.2837f, 123.3157f, 51.6484f, -22.3715f, 142.9798f, 30.9887f, -3.4962f, 6.9860f, 7.3904f, 40.2644f, - 204.1520f, 329.0802f, 241.1047f, 433.1711f, 162.1569f, 441.9229f, 172.2023f, 545.2635f, 41.6043f, -18.2279f, 124.3886f, 63.1082f, - 213.0999f, 303.8811f, 237.9903f, 444.1898f, 155.2101f, 6.7177f, 247.1608f, 65.1444f, 324.4111f, 233.2946f, 443.2500f, 358.8382f, - 384.8351f, 371.9398f, 508.2953f, 384.1355f, 302.7226f, 123.9848f, 349.8446f, 235.2196f, 20.8081f, -68.6720f, 103.6023f, 79.6067f, - 105.2511f, 234.0231f, 190.1397f, 361.1662f, 420.9290f, 451.9373f, 492.3893f, 539.3073f, -4.9387f, 81.6146f, 93.6732f, 176.0028f, - 187.2764f, 67.9256f, 219.5794f, 121.5657f, 397.7987f, 10.8413f, 544.7059f, 113.0846f, 467.5255f, 219.7334f, 483.1394f, 335.5223f, - 143.3246f, 223.3545f, 267.8786f, 373.0906f, 288.9383f, 358.9469f, 378.4586f, 
433.3239f, 209.6311f, 371.4695f, 247.1145f, 381.6038f, - 320.6775f, 401.3793f, 432.7831f, 491.1622f, 8.9968f, 393.5190f, 22.5845f, 412.2537f, 13.8844f, 104.8985f, 130.2727f, 142.3685f, - 262.6455f, 252.9446f, 351.5533f, 302.9328f, 107.5252f, 93.7443f, 125.0270f, 203.6677f, 326.6030f, 150.6990f, 339.4493f, 179.0864f, - 119.1742f, 453.1236f, 232.0488f, 478.8208f, 420.9991f, 337.0981f, 465.6465f, 344.7978f, 342.8767f, 421.7388f, 476.3827f, 552.8516f, - 189.1445f, 156.2901f, 303.6933f, 260.6224f, 333.9324f, 265.2428f, 438.9627f, 272.1948f, 114.3128f, 240.9499f, 156.8251f, 246.1655f, - 193.8135f, 11.5223f, 300.4463f, 95.7648f, 27.6040f, 96.8022f, 169.8780f, 139.8998f, 423.1219f, 218.8621f, 437.7643f, 308.7743f, - 386.7347f, 0.8091f, 436.3329f, 66.5652f, 433.0917f, 396.4442f, 469.0579f, 535.0178f, 408.9413f, 39.9801f, 468.5356f, 83.8636f, - 423.9944f, 47.8940f, 535.6019f, 150.0867f, 78.3370f, 378.1336f, 149.9992f, 387.1877f, 422.8927f, -23.2443f, 508.9316f, 120.1789f, - 261.7021f, 376.5726f, 309.5111f, 523.7055f, 200.2215f, 307.2894f, 222.2736f, 418.4116f, 259.8004f, -0.8479f, 300.5735f, 69.4688f, - 106.7550f, 329.0340f, 235.8474f, 362.8130f, 98.8964f, 254.7818f, 189.6566f, 376.8467f, 91.9970f, 323.3163f, 149.3173f, 434.0331f, - -18.1340f, 397.0634f, 100.5620f, 431.1345f, 242.9804f, 325.0598f, 253.5845f, 393.2908f, 424.4659f, 258.1096f, 463.2957f, 328.0667f, - 297.4333f, 99.1641f, 332.7187f, 223.2992f, 186.5782f, 297.1904f, 334.3975f, 400.0833f, 161.1921f, 430.0698f, 267.4008f, 526.9018f, - 185.6758f, 244.8488f, 278.7259f, 342.6730f, 103.7673f, 311.5224f, 105.5101f, 352.8224f, 397.2368f, 190.3715f, 425.6990f, 246.7565f, - 51.3437f, 374.1586f, 147.0393f, 381.9622f, 329.5223f, 439.7066f, 387.1005f, 557.9608f, 310.6336f, 47.4363f, 449.3514f, 112.9530f, - 229.9626f, 68.0539f, 344.9065f, 134.3514f, 397.6331f, 250.9398f, 465.2933f, 288.4979f, 89.1863f, 224.5854f, 201.8640f, 256.7900f, - 367.6410f, 241.4922f, 513.9763f, 330.0776f, 329.8622f, 6.7118f, 399.5483f, 42.3622f, 
351.0067f, 196.8547f, 447.7431f, 207.4218f, - 263.3493f, 233.8098f, 401.2304f, 349.1684f, 404.1452f, 264.0487f, 442.1978f, 321.1426f, 430.0009f, 299.8394f, 563.0980f, 357.4945f, - 202.3143f, 327.4748f, 217.8485f, 392.7412f, 358.1485f, 259.5528f, 455.7672f, 381.9944f, 313.4684f, 370.7192f, 431.1113f, 419.5239f, - 180.1469f, 255.4066f, 272.7232f, 369.3540f, 426.0572f, 198.2577f, 500.8918f, 339.2499f, 150.7206f, 253.3635f, 243.7053f, 352.8329f, - 270.9340f, 17.9364f, 294.5319f, 83.2569f, 36.4112f, 80.3679f, 69.5312f, 192.7886f, 92.2801f, 229.0865f, 133.4951f, 298.3132f, - 375.3135f, 405.1188f, 465.3827f, 467.8684f, 164.8547f, 299.8922f, 231.6980f, 379.1594f, 178.3286f, 21.0337f, 215.7555f, 69.3744f, - 56.7212f, 287.8708f, 189.2598f, 304.4041f, 217.4480f, 79.4625f, 274.1624f, 142.2755f, 369.1791f, 357.2809f, 436.6378f, 376.7356f, - 416.5593f, 382.6425f, 478.6048f, 444.7983f, 21.0025f, 254.7366f, 49.1120f, 338.7197f, 232.4042f, 225.8433f, 342.4166f, 365.5193f, - 199.7265f, 166.0972f, 267.5468f, 172.4943f, 305.4298f, 176.3264f, 308.8521f, 269.9237f, 151.3188f, 397.4529f, 295.9569f, 466.6555f, - 138.0480f, 359.6507f, 260.5968f, 363.6696f, 181.5352f, 240.7855f, 290.3455f, 278.9682f, 225.7522f, 174.7890f, 356.2469f, 193.4433f, - 182.4345f, 8.5387f, 318.5487f, 41.8410f, 210.4292f, 50.5482f, 261.7152f, 92.4592f, 362.9012f, 66.1153f, 454.9341f, 126.9099f, - 326.9678f, 146.7783f, 418.6802f, 226.6052f, 150.2754f, 471.4981f, 191.1031f, 472.6456f, 383.2531f, 240.0174f, 417.3240f, 265.1360f, - 417.8392f, 109.9494f, 435.8114f, 124.8908f, 27.1272f, 11.4244f, 126.3650f, 94.3257f, 232.6628f, 144.1367f, 350.0197f, 194.1688f, - 85.4650f, 366.5097f, 199.8470f, 449.2209f, 345.5237f, 174.6456f, 393.6487f, 208.6972f, 103.6008f, 383.9478f, 135.1845f, 388.5580f, - 301.4075f, 330.7206f, 369.9960f, 471.9843f, 86.3247f, 46.8414f, 168.7999f, 63.9793f, 186.5999f, 294.3789f, 324.5439f, 314.2809f, - 408.6489f, 468.1303f, 539.9976f, 490.9658f, 121.9074f, 127.4639f, 259.4001f, 274.6741f, 374.0247f, 
-21.0436f, 501.7138f, 71.9877f, - 421.1110f, 415.6848f, 565.8336f, 507.6180f, 402.2457f, 367.8241f, 472.6052f, 515.8422f, 78.8962f, 253.9820f, 86.9698f, 268.1594f, - 403.1037f, 203.0262f, 416.5545f, 349.2269f, -13.5009f, 90.1716f, 45.6503f, 121.5695f, 176.9532f, 362.8065f, 216.3486f, 456.6442f, - 422.2061f, 217.5038f, 448.5273f, 281.0963f, 272.8624f, -12.1655f, 415.8898f, 46.0433f, 251.3114f, 271.6299f, 281.4290f, 411.3851f, - 121.9583f, 463.6307f, 265.9058f, 486.8656f, 348.9660f, 339.7936f, 463.3310f, 489.3569f, 306.5287f, 109.8543f, 403.0297f, 167.3439f, - 183.3392f, -22.1712f, 285.0661f, 75.4963f, 421.0473f, 397.5667f, 471.4370f, 542.7847f, 66.3152f, 463.7401f, 163.6328f, 473.3226f, - 70.7872f, 196.9543f, 99.6043f, 335.4611f, 251.0428f, 278.3568f, 391.7609f, 363.9607f, 463.0136f, 178.3225f, 508.9808f, 284.2776f, - 104.1169f, 198.2685f, 143.1397f, 221.4969f, 71.3536f, 19.4869f, 178.3168f, 99.9616f, 20.3440f, -2.3003f, 119.1549f, 99.0532f, - 396.1600f, 81.8756f, 464.4035f, 150.8565f, 65.5815f, 406.2740f, 160.8160f, 430.3668f, 239.2070f, 54.2293f, 263.9715f, 91.6030f, - 444.7733f, 49.1971f, 546.0992f, 177.5016f, -14.5900f, 271.2390f, 26.7309f, 277.3751f, 257.4168f, 54.2554f, 299.0693f, 160.8758f, - 243.5621f, 6.6488f, 268.7269f, 156.5579f, 378.4616f, 280.6006f, 428.9858f, 282.7156f, 152.4626f, 171.5487f, 202.8190f, 196.5445f, - 170.8344f, 262.3559f, 239.5070f, 363.8034f, 69.2827f, 451.1334f, 98.6552f, 461.0720f, 355.5286f, 31.0572f, 385.2867f, 119.9359f, - 351.4949f, 405.2588f, 433.2140f, 508.1748f, 58.2303f, 406.9281f, 78.4330f, 495.5619f, 144.9057f, 386.8375f, 248.5514f, 442.2501f, - 375.6284f, 263.1954f, 517.2766f, 368.0905f, -30.9426f, 265.2984f, 33.6499f, 354.8483f, 81.7472f, 303.6374f, 217.0119f, 335.5753f, - 269.6966f, 302.7942f, 285.3457f, 387.7014f, 163.3466f, -57.9610f, 170.7473f, 74.4432f, 81.7806f, 428.8672f, 190.2646f, 529.2253f, - 172.8226f, 257.1534f, 287.2148f, 328.4503f, 27.4537f, 366.2749f, 154.0694f, 415.1909f, 260.0797f, 181.7424f, 269.5455f, 
195.5394f, - 294.9684f, -12.5261f, 411.7275f, 24.9233f, 259.0953f, 253.5339f, 316.1996f, 256.2007f, 23.4560f, 179.5914f, 69.6533f, 327.5987f, - 408.8140f, 201.4197f, 435.5946f, 235.5696f, 12.7857f, 108.6503f, 162.1921f, 231.0668f, 377.1631f, 111.8490f, 387.6489f, 137.9771f, - 118.1705f, 242.1441f, 242.3947f, 285.4007f, 343.2383f, 155.9774f, 439.5230f, 219.3007f, 47.8730f, 460.2977f, 158.3999f, 509.6342f, - 39.8081f, 26.4865f, 146.8540f, 146.4408f, 184.0596f, 87.9846f, 312.9663f, 231.6809f, 2.2755f, 81.2708f, 30.6605f, 212.6897f, - 112.0872f, 259.7130f, 113.2101f, 283.5961f, 316.9157f, 191.2768f, 407.0965f, 308.0034f, 391.8293f, 310.3482f, 445.5542f, 333.3923f, - 30.6705f, 406.4540f, 50.1148f, 543.5478f, 426.6715f, 103.5286f, 455.4062f, 181.6925f, 373.5433f, 320.8254f, 423.9739f, 371.9462f, - 429.1098f, 0.3217f, 440.5745f, 24.7185f, 344.4742f, 129.8145f, 353.9543f, 132.5740f, 268.3326f, 212.8878f, 405.8205f, 250.8319f, - 238.7950f, -53.0971f, 286.2983f, 84.0919f}; + 132.1894f, 453.1169f, 199.9736f, 545.7127f, 64.3090f, 275.1729f, 104.8258f, 338.3436f, + 76.1273f, 401.7875f, 135.6448f, 487.9920f, 12.8305f, 442.3624f, 77.1708f, 466.2458f, + -5.9609f, 340.1129f, 126.0715f, 451.3386f, 15.0119f, 224.3769f, 56.2927f, 236.5545f, + 427.8277f, -14.2917f, 561.9954f, 95.4457f, 4.7940f, -55.8461f, 69.2637f, 71.6517f, + 41.3494f, 202.9014f, 91.1927f, 274.2992f, 375.6902f, 208.6749f, 451.5645f, 285.6396f, + 258.4982f, 179.9212f, 321.7420f, 227.4412f, 367.5344f, 211.3590f, 406.8828f, 356.8083f, + 277.5064f, 220.9636f, 353.4056f, 331.1991f, 429.2783f, 390.3169f, 452.8968f, 446.2962f, + 292.5150f, 40.8054f, 345.9525f, 67.8517f, 218.4112f, 95.7302f, 303.7139f, 129.4475f, + 325.0759f, 361.4403f, 387.6738f, 431.5647f, 161.8149f, 353.1971f, 285.5779f, 494.6398f, + 153.4061f, 442.2182f, 192.6577f, 552.6060f, 161.0782f, 419.9203f, 306.5742f, 452.9917f, + 25.8953f, 380.4122f, 143.8188f, 509.4868f, 325.7002f, 128.4980f, 470.8716f, 185.8499f, + 67.4107f, 136.8775f, 193.2931f, 264.7841f, 
65.6790f, 115.5359f, 87.8525f, 152.5492f, + 83.4548f, 256.5595f, 162.8974f, 349.7399f, 407.8717f, 399.8657f, 434.1985f, 538.9396f, + 103.6427f, 152.6073f, 226.5586f, 192.0336f, 299.0049f, 226.3779f, 387.0450f, 330.6239f, + 408.0779f, 74.0950f, 448.3318f, 222.2046f, -30.8828f, 73.1804f, 108.6275f, 96.6196f, + 373.4308f, 90.5068f, 391.5936f, 104.6787f, 111.3250f, -21.7549f, 196.3405f, 79.7002f, + 54.0937f, 448.8364f, 162.5287f, 500.4571f, 339.5665f, 195.6321f, 349.3349f, 207.2475f, + 409.8580f, 381.1502f, 499.9386f, 452.9707f, 86.2250f, 284.0088f, 208.7943f, 397.3206f, + 278.8861f, 74.2190f, 289.9477f, 117.7022f, 106.2550f, 62.2701f, 183.5792f, 113.1921f, + 257.3803f, 342.4895f, 296.9053f, 469.4987f, 261.0432f, 93.1105f, 360.8189f, 171.6012f, + 295.8262f, 393.3591f, 314.5092f, 519.9261f, 241.4629f, 36.2717f, 382.0835f, 103.7837f, + 0.3826f, 267.3577f, 134.6972f, 410.3510f, 332.4151f, 358.2527f, 361.1253f, 456.2211f, + 312.7919f, 108.4937f, 361.9585f, 126.7627f, 297.0153f, 71.6643f, 385.5729f, 204.5431f, + -16.9604f, 445.3092f, 91.0309f, 519.2097f, 189.9415f, 121.2467f, 256.8973f, 143.3509f, + 192.3739f, 203.1031f, 216.6613f, 226.8539f, 35.0965f, 164.5365f, 51.6150f, 267.9791f, + 36.2014f, 122.4881f, 186.1665f, 130.5466f, 186.0576f, 366.0443f, 254.9050f, 409.7468f, + 305.9496f, 375.0105f, 436.9568f, 396.8388f, 82.0940f, 155.7987f, 154.9680f, 222.5193f, + 345.6593f, 386.1935f, 484.0906f, 448.9323f, 265.8611f, 67.1577f, 279.9372f, 145.9173f, + 371.2164f, -19.1800f, 389.2053f, 23.4858f, 166.5204f, 282.6964f, 306.0356f, 288.4709f, + 178.5089f, 450.7671f, 320.6853f, 543.3107f, 285.9132f, -9.0198f, 333.8062f, 47.6641f, + 437.0255f, 54.9746f, 490.9451f, 153.0235f, 211.6987f, 250.8616f, 280.1138f, 268.0530f, + 232.8247f, 403.4440f, 295.8328f, 406.4968f, 286.3401f, 25.5231f, 315.6569f, 63.5189f, + 301.3286f, 163.1046f, 436.1865f, 232.1301f, 16.5538f, 343.6795f, 55.2966f, 403.3963f, + 204.8009f, 124.9041f, 310.8865f, 246.6391f, 235.2927f, 65.7693f, 246.2989f, 123.0671f, + 
457.4555f, 57.7300f, 464.2295f, 137.7658f, 197.5504f, 160.3075f, 295.9562f, 249.7413f, + 208.4036f, 237.5821f, 259.9170f, 241.8350f, 431.7683f, 392.0298f, 530.4317f, 469.7846f, + 217.7836f, 294.9363f, 232.7928f, 347.3161f, 19.1783f, 313.3156f, 161.7061f, 377.0863f, + 52.1937f, 483.5222f, 164.7224f, 499.4650f, -18.1881f, 147.1016f, 113.3757f, 264.7419f, + -10.3830f, 130.9681f, 10.9511f, 272.3863f, 191.6208f, 459.5145f, 240.3248f, 463.8325f, + 356.6797f, 77.6355f, 412.5629f, 168.2401f, 326.2139f, 307.5013f, 407.2526f, 422.3140f, + -6.5422f, 355.5684f, 38.6912f, 399.0047f, 279.9745f, -10.2789f, 290.0085f, 108.0669f, + 49.1601f, 186.5052f, 105.1230f, 281.7262f, 451.0742f, 30.5586f, 490.0021f, 170.0038f, + 54.4314f, 19.1028f, 112.9336f, 166.2725f, 298.1461f, 228.2593f, 328.4931f, 235.5688f, + 143.1079f, 111.0670f, 183.1305f, 178.3627f, 273.5727f, 356.7796f, 367.9886f, 439.2808f, + 176.7118f, 442.3701f, 235.5468f, 465.2348f, 353.5905f, 375.8070f, 406.0526f, 426.9136f, + 75.0636f, 58.9357f, 155.6155f, 207.0952f, 394.8923f, 135.3580f, 510.8995f, 138.7764f, + 221.3792f, 93.1523f, 278.8305f, 161.5760f, 333.7764f, 4.2413f, 422.3168f, 130.7968f, + 352.3830f, 447.2686f, 497.3472f, 496.5298f, 460.0268f, 164.7789f, 538.8018f, 237.2689f, + 43.6929f, 38.9803f, 180.2527f, 185.7092f, 83.8176f, 387.4572f, 203.0748f, 459.2138f, + 120.3420f, 189.3440f, 130.0911f, 209.8513f, 98.9678f, 13.2052f, 163.9035f, 21.9117f, + 238.6976f, 10.0373f, 343.7471f, 151.9043f, 422.7512f, 299.3224f, 570.7713f, 339.9280f, + 460.4900f, 353.3999f, 529.7881f, 429.5054f, 255.9741f, 98.2099f, 270.7991f, 112.7245f, + 277.1439f, 426.6355f, 361.8833f, 490.7601f, 420.0563f, 355.7057f, 439.9143f, 495.2914f, + 409.9785f, 386.2606f, 522.9550f, 462.1201f, 63.6084f, 40.9810f, 140.2522f, 186.6801f, + 209.8752f, 5.4847f, 318.6665f, 45.0513f, 351.1511f, 395.6231f, 481.6860f, 471.8004f, + 104.2444f, 88.3651f, 198.9577f, 217.4352f, 173.7778f, 275.5634f, 266.0312f, 343.3530f, + 436.0951f, 358.6616f, 549.5261f, 401.3052f, 
429.2604f, -0.0863f, 555.7863f, 128.3795f, + 387.8089f, 360.8724f, 518.2979f, 419.9659f, 396.0101f, 429.2169f, 402.4382f, 509.2946f, + 92.6291f, 290.9362f, 176.5014f, 437.4388f, 143.8130f, 206.2184f, 177.0371f, 235.0044f, + 209.0457f, 415.3847f, 338.2372f, 461.2934f, 231.5831f, 260.9141f, 329.1943f, 266.5435f, + 220.9448f, 342.6935f, 284.5580f, 402.0774f, 303.8214f, 394.8393f, 332.8489f, 425.6666f, + 178.4043f, 323.5138f, 229.9188f, 425.8390f, 321.6556f, 129.9190f, 427.5185f, 157.9359f, + 151.0502f, 8.1484f, 182.4998f, 109.6955f, 157.8666f, 99.0403f, 172.8104f, 139.2982f, + -3.0452f, 224.4737f, 130.2711f, 278.4012f, 36.9224f, 226.1483f, 151.7898f, 279.1286f, + 409.8757f, 237.4242f, 440.6452f, 345.2202f, 200.8640f, 162.2960f, 245.4184f, 232.8059f, + 41.0147f, 366.0289f, 186.8531f, 420.8625f, 326.4108f, 392.5565f, 432.9303f, 520.5973f, + 231.0067f, 80.2522f, 322.9745f, 166.4729f, -12.8403f, 351.8312f, 33.9963f, 384.6920f, + 135.3959f, 271.4291f, 180.9655f, 406.5427f, 85.0562f, 235.5178f, 91.9452f, 287.5727f, + 273.1645f, 90.8612f, 382.7083f, 97.6691f, 133.7990f, 360.2684f, 141.2321f, 434.9638f, + 31.6115f, 470.5798f, 33.3353f, 490.0465f, -27.3799f, 342.6524f, 82.3149f, 379.1839f, + 219.6726f, 402.7702f, 362.0547f, 515.0898f, -45.9977f, 481.8516f, 67.7212f, 502.3336f, + 388.7589f, 115.4080f, 460.0333f, 236.6427f, 40.9882f, 248.8122f, 114.4089f, 389.4114f, + 270.2910f, 191.2797f, 336.2753f, 282.6530f, 197.6581f, 439.8926f, 247.0300f, 546.7361f, + 182.0580f, -6.7583f, 260.7935f, 100.5661f, 3.2778f, 131.7233f, 68.5193f, 280.6516f, + 356.3126f, 411.8249f, 446.4396f, 463.7141f, 379.1163f, 129.3928f, 513.9362f, 154.6585f, + -69.1199f, 354.7185f, 80.1365f, 433.0744f, 82.9357f, 151.1645f, 95.6685f, 231.6187f, + 422.7932f, 476.2348f, 481.1110f, 503.7437f, 260.7842f, 395.5883f, 288.7094f, 487.9416f, + 48.2868f, 149.1079f, 101.7528f, 152.2125f, 79.4785f, 315.4853f, 123.3120f, 454.7079f, + 316.4901f, 148.2175f, 343.4961f, 188.6391f, 304.9847f, 299.7342f, 419.8321f, 306.6287f, + 
262.2399f, 320.6758f, 337.1869f, 337.8050f, 407.5904f, 396.3992f, 545.5580f, 433.1963f, + 244.1037f, -8.6806f, 249.9599f, 33.1314f, 144.6461f, 107.1346f, 155.6258f, 113.0233f, + 208.0726f, 334.6470f, 269.1603f, 377.2708f, 173.3525f, 266.8875f, 186.3138f, 296.6358f, + 92.1346f, 219.0953f, 132.2813f, 276.5098f, -50.9776f, -1.5900f, 96.9408f, 56.8000f, + 160.0388f, 148.3819f, 192.1737f, 199.8940f, 340.4449f, 407.6198f, 370.9644f, 457.4804f, + -34.0173f, 8.2614f, 52.4551f, 22.6314f, 181.9884f, 195.8403f, 257.1901f, 200.5959f, + 278.2621f, 457.0166f, 365.7473f, 488.1317f, 276.6353f, -31.4300f, 333.7688f, 82.3108f, + 326.2304f, 300.5375f, 450.4180f, 449.1682f, 394.4356f, 59.1311f, 416.0841f, 198.4815f, + 323.4377f, 395.2401f, 388.2682f, 471.3687f, -0.4884f, 332.9131f, 103.2861f, 413.1549f, + 172.3276f, 418.9163f, 302.6948f, 466.7889f, 273.6699f, 49.8039f, 329.7361f, 166.1209f, + 79.9860f, 208.1720f, 165.5801f, 323.1208f, 15.6250f, 326.2367f, 26.9268f, 453.0333f, + 98.6064f, 55.6348f, 124.9839f, 190.0650f, 221.7964f, 82.5141f, 233.0980f, 148.2322f, + 152.2380f, -44.0412f, 261.6923f, 71.2233f, 66.3730f, 418.6809f, 110.2940f, 539.8344f, + 357.7888f, 331.5282f, 466.6268f, 378.4887f, 457.3967f, 248.0516f, 468.2900f, 387.5087f, + 35.9143f, 364.4689f, 165.4340f, 379.5258f, 402.0395f, 191.2334f, 527.5334f, 340.3795f, + 1.8053f, 180.1951f, 16.0557f, 295.9387f, 460.2114f, 217.3174f, 464.7511f, 232.2148f, + 471.2709f, 270.8305f, 480.6579f, 369.6087f, -58.0695f, 97.7211f, 70.1214f, 103.8139f, + 363.5242f, 386.1504f, 399.4951f, 501.9083f, 443.7544f, 345.8341f, 526.4471f, 465.9183f, + 420.6959f, 129.4022f, 485.2063f, 220.1614f, 425.5884f, 224.9686f, 545.1217f, 353.6407f, + 238.2388f, 62.7213f, 312.0847f, 78.3060f, 1.2788f, 465.1168f, 76.8773f, 507.2295f, + 350.7072f, 420.0901f, 499.0819f, 482.8026f, 295.2295f, 457.2856f, 318.5988f, 464.6119f, + 248.9387f, 366.2193f, 368.7308f, 464.4846f, 266.4057f, -43.0988f, 411.9049f, 94.8485f, + 365.3591f, 230.8355f, 381.3726f, 246.8133f, 
213.6699f, 419.1429f, 302.9046f, 467.1919f, + 282.3146f, 326.7091f, 321.6300f, 338.5049f, 157.0835f, 271.7193f, 238.9818f, 413.4953f, + -3.7474f, 97.9864f, 45.0004f, 165.3309f, 28.3577f, 158.4742f, 71.5941f, 260.1006f, + 284.2465f, 120.1271f, 370.7495f, 246.4540f, 483.6205f, 186.3921f, 511.9348f, 335.0511f, + -27.5488f, 218.5612f, 43.3521f, 243.6668f, 229.8062f, 103.3855f, 327.7773f, 223.5129f, + 365.4548f, 86.1273f, 385.5540f, 219.3533f, 343.5581f, 121.2852f, 483.2167f, 129.5677f, + 234.4260f, 125.8439f, 310.7789f, 239.2034f, 248.4032f, 48.0437f, 371.5128f, 101.8978f, + 299.1465f, 387.2317f, 397.5784f, 484.8726f, 376.0880f, 262.2631f, 482.8782f, 339.8563f, + 7.2930f, 47.0424f, 114.9965f, 86.7440f, 397.3961f, 336.3557f, 528.7860f, 357.5037f, + -33.2049f, 414.6207f, 59.2223f, 433.0458f, 396.8727f, 110.5703f, 439.3271f, 126.9654f, + 30.4567f, 27.2849f, 46.2837f, 123.3157f, 51.6484f, -22.3715f, 142.9798f, 30.9887f, + -3.4962f, 6.9860f, 7.3904f, 40.2644f, 204.1520f, 329.0802f, 241.1047f, 433.1711f, + 162.1569f, 441.9229f, 172.2023f, 545.2635f, 41.6043f, -18.2279f, 124.3886f, 63.1082f, + 213.0999f, 303.8811f, 237.9903f, 444.1898f, 155.2101f, 6.7177f, 247.1608f, 65.1444f, + 324.4111f, 233.2946f, 443.2500f, 358.8382f, 384.8351f, 371.9398f, 508.2953f, 384.1355f, + 302.7226f, 123.9848f, 349.8446f, 235.2196f, 20.8081f, -68.6720f, 103.6023f, 79.6067f, + 105.2511f, 234.0231f, 190.1397f, 361.1662f, 420.9290f, 451.9373f, 492.3893f, 539.3073f, + -4.9387f, 81.6146f, 93.6732f, 176.0028f, 187.2764f, 67.9256f, 219.5794f, 121.5657f, + 397.7987f, 10.8413f, 544.7059f, 113.0846f, 467.5255f, 219.7334f, 483.1394f, 335.5223f, + 143.3246f, 223.3545f, 267.8786f, 373.0906f, 288.9383f, 358.9469f, 378.4586f, 433.3239f, + 209.6311f, 371.4695f, 247.1145f, 381.6038f, 320.6775f, 401.3793f, 432.7831f, 491.1622f, + 8.9968f, 393.5190f, 22.5845f, 412.2537f, 13.8844f, 104.8985f, 130.2727f, 142.3685f, + 262.6455f, 252.9446f, 351.5533f, 302.9328f, 107.5252f, 93.7443f, 125.0270f, 203.6677f, + 326.6030f, 
150.6990f, 339.4493f, 179.0864f, 119.1742f, 453.1236f, 232.0488f, 478.8208f, + 420.9991f, 337.0981f, 465.6465f, 344.7978f, 342.8767f, 421.7388f, 476.3827f, 552.8516f, + 189.1445f, 156.2901f, 303.6933f, 260.6224f, 333.9324f, 265.2428f, 438.9627f, 272.1948f, + 114.3128f, 240.9499f, 156.8251f, 246.1655f, 193.8135f, 11.5223f, 300.4463f, 95.7648f, + 27.6040f, 96.8022f, 169.8780f, 139.8998f, 423.1219f, 218.8621f, 437.7643f, 308.7743f, + 386.7347f, 0.8091f, 436.3329f, 66.5652f, 433.0917f, 396.4442f, 469.0579f, 535.0178f, + 408.9413f, 39.9801f, 468.5356f, 83.8636f, 423.9944f, 47.8940f, 535.6019f, 150.0867f, + 78.3370f, 378.1336f, 149.9992f, 387.1877f, 422.8927f, -23.2443f, 508.9316f, 120.1789f, + 261.7021f, 376.5726f, 309.5111f, 523.7055f, 200.2215f, 307.2894f, 222.2736f, 418.4116f, + 259.8004f, -0.8479f, 300.5735f, 69.4688f, 106.7550f, 329.0340f, 235.8474f, 362.8130f, + 98.8964f, 254.7818f, 189.6566f, 376.8467f, 91.9970f, 323.3163f, 149.3173f, 434.0331f, + -18.1340f, 397.0634f, 100.5620f, 431.1345f, 242.9804f, 325.0598f, 253.5845f, 393.2908f, + 424.4659f, 258.1096f, 463.2957f, 328.0667f, 297.4333f, 99.1641f, 332.7187f, 223.2992f, + 186.5782f, 297.1904f, 334.3975f, 400.0833f, 161.1921f, 430.0698f, 267.4008f, 526.9018f, + 185.6758f, 244.8488f, 278.7259f, 342.6730f, 103.7673f, 311.5224f, 105.5101f, 352.8224f, + 397.2368f, 190.3715f, 425.6990f, 246.7565f, 51.3437f, 374.1586f, 147.0393f, 381.9622f, + 329.5223f, 439.7066f, 387.1005f, 557.9608f, 310.6336f, 47.4363f, 449.3514f, 112.9530f, + 229.9626f, 68.0539f, 344.9065f, 134.3514f, 397.6331f, 250.9398f, 465.2933f, 288.4979f, + 89.1863f, 224.5854f, 201.8640f, 256.7900f, 367.6410f, 241.4922f, 513.9763f, 330.0776f, + 329.8622f, 6.7118f, 399.5483f, 42.3622f, 351.0067f, 196.8547f, 447.7431f, 207.4218f, + 263.3493f, 233.8098f, 401.2304f, 349.1684f, 404.1452f, 264.0487f, 442.1978f, 321.1426f, + 430.0009f, 299.8394f, 563.0980f, 357.4945f, 202.3143f, 327.4748f, 217.8485f, 392.7412f, + 358.1485f, 259.5528f, 455.7672f, 381.9944f, 
313.4684f, 370.7192f, 431.1113f, 419.5239f, + 180.1469f, 255.4066f, 272.7232f, 369.3540f, 426.0572f, 198.2577f, 500.8918f, 339.2499f, + 150.7206f, 253.3635f, 243.7053f, 352.8329f, 270.9340f, 17.9364f, 294.5319f, 83.2569f, + 36.4112f, 80.3679f, 69.5312f, 192.7886f, 92.2801f, 229.0865f, 133.4951f, 298.3132f, + 375.3135f, 405.1188f, 465.3827f, 467.8684f, 164.8547f, 299.8922f, 231.6980f, 379.1594f, + 178.3286f, 21.0337f, 215.7555f, 69.3744f, 56.7212f, 287.8708f, 189.2598f, 304.4041f, + 217.4480f, 79.4625f, 274.1624f, 142.2755f, 369.1791f, 357.2809f, 436.6378f, 376.7356f, + 416.5593f, 382.6425f, 478.6048f, 444.7983f, 21.0025f, 254.7366f, 49.1120f, 338.7197f, + 232.4042f, 225.8433f, 342.4166f, 365.5193f, 199.7265f, 166.0972f, 267.5468f, 172.4943f, + 305.4298f, 176.3264f, 308.8521f, 269.9237f, 151.3188f, 397.4529f, 295.9569f, 466.6555f, + 138.0480f, 359.6507f, 260.5968f, 363.6696f, 181.5352f, 240.7855f, 290.3455f, 278.9682f, + 225.7522f, 174.7890f, 356.2469f, 193.4433f, 182.4345f, 8.5387f, 318.5487f, 41.8410f, + 210.4292f, 50.5482f, 261.7152f, 92.4592f, 362.9012f, 66.1153f, 454.9341f, 126.9099f, + 326.9678f, 146.7783f, 418.6802f, 226.6052f, 150.2754f, 471.4981f, 191.1031f, 472.6456f, + 383.2531f, 240.0174f, 417.3240f, 265.1360f, 417.8392f, 109.9494f, 435.8114f, 124.8908f, + 27.1272f, 11.4244f, 126.3650f, 94.3257f, 232.6628f, 144.1367f, 350.0197f, 194.1688f, + 85.4650f, 366.5097f, 199.8470f, 449.2209f, 345.5237f, 174.6456f, 393.6487f, 208.6972f, + 103.6008f, 383.9478f, 135.1845f, 388.5580f, 301.4075f, 330.7206f, 369.9960f, 471.9843f, + 86.3247f, 46.8414f, 168.7999f, 63.9793f, 186.5999f, 294.3789f, 324.5439f, 314.2809f, + 408.6489f, 468.1303f, 539.9976f, 490.9658f, 121.9074f, 127.4639f, 259.4001f, 274.6741f, + 374.0247f, -21.0436f, 501.7138f, 71.9877f, 421.1110f, 415.6848f, 565.8336f, 507.6180f, + 402.2457f, 367.8241f, 472.6052f, 515.8422f, 78.8962f, 253.9820f, 86.9698f, 268.1594f, + 403.1037f, 203.0262f, 416.5545f, 349.2269f, -13.5009f, 90.1716f, 45.6503f, 121.5695f, + 
176.9532f, 362.8065f, 216.3486f, 456.6442f, 422.2061f, 217.5038f, 448.5273f, 281.0963f, + 272.8624f, -12.1655f, 415.8898f, 46.0433f, 251.3114f, 271.6299f, 281.4290f, 411.3851f, + 121.9583f, 463.6307f, 265.9058f, 486.8656f, 348.9660f, 339.7936f, 463.3310f, 489.3569f, + 306.5287f, 109.8543f, 403.0297f, 167.3439f, 183.3392f, -22.1712f, 285.0661f, 75.4963f, + 421.0473f, 397.5667f, 471.4370f, 542.7847f, 66.3152f, 463.7401f, 163.6328f, 473.3226f, + 70.7872f, 196.9543f, 99.6043f, 335.4611f, 251.0428f, 278.3568f, 391.7609f, 363.9607f, + 463.0136f, 178.3225f, 508.9808f, 284.2776f, 104.1169f, 198.2685f, 143.1397f, 221.4969f, + 71.3536f, 19.4869f, 178.3168f, 99.9616f, 20.3440f, -2.3003f, 119.1549f, 99.0532f, + 396.1600f, 81.8756f, 464.4035f, 150.8565f, 65.5815f, 406.2740f, 160.8160f, 430.3668f, + 239.2070f, 54.2293f, 263.9715f, 91.6030f, 444.7733f, 49.1971f, 546.0992f, 177.5016f, + -14.5900f, 271.2390f, 26.7309f, 277.3751f, 257.4168f, 54.2554f, 299.0693f, 160.8758f, + 243.5621f, 6.6488f, 268.7269f, 156.5579f, 378.4616f, 280.6006f, 428.9858f, 282.7156f, + 152.4626f, 171.5487f, 202.8190f, 196.5445f, 170.8344f, 262.3559f, 239.5070f, 363.8034f, + 69.2827f, 451.1334f, 98.6552f, 461.0720f, 355.5286f, 31.0572f, 385.2867f, 119.9359f, + 351.4949f, 405.2588f, 433.2140f, 508.1748f, 58.2303f, 406.9281f, 78.4330f, 495.5619f, + 144.9057f, 386.8375f, 248.5514f, 442.2501f, 375.6284f, 263.1954f, 517.2766f, 368.0905f, + -30.9426f, 265.2984f, 33.6499f, 354.8483f, 81.7472f, 303.6374f, 217.0119f, 335.5753f, + 269.6966f, 302.7942f, 285.3457f, 387.7014f, 163.3466f, -57.9610f, 170.7473f, 74.4432f, + 81.7806f, 428.8672f, 190.2646f, 529.2253f, 172.8226f, 257.1534f, 287.2148f, 328.4503f, + 27.4537f, 366.2749f, 154.0694f, 415.1909f, 260.0797f, 181.7424f, 269.5455f, 195.5394f, + 294.9684f, -12.5261f, 411.7275f, 24.9233f, 259.0953f, 253.5339f, 316.1996f, 256.2007f, + 23.4560f, 179.5914f, 69.6533f, 327.5987f, 408.8140f, 201.4197f, 435.5946f, 235.5696f, + 12.7857f, 108.6503f, 162.1921f, 231.0668f, 
377.1631f, 111.8490f, 387.6489f, 137.9771f, + 118.1705f, 242.1441f, 242.3947f, 285.4007f, 343.2383f, 155.9774f, 439.5230f, 219.3007f, + 47.8730f, 460.2977f, 158.3999f, 509.6342f, 39.8081f, 26.4865f, 146.8540f, 146.4408f, + 184.0596f, 87.9846f, 312.9663f, 231.6809f, 2.2755f, 81.2708f, 30.6605f, 212.6897f, + 112.0872f, 259.7130f, 113.2101f, 283.5961f, 316.9157f, 191.2768f, 407.0965f, 308.0034f, + 391.8293f, 310.3482f, 445.5542f, 333.3923f, 30.6705f, 406.4540f, 50.1148f, 543.5478f, + 426.6715f, 103.5286f, 455.4062f, 181.6925f, 373.5433f, 320.8254f, 423.9739f, 371.9462f, + 429.1098f, 0.3217f, 440.5745f, 24.7185f, 344.4742f, 129.8145f, 353.9543f, 132.5740f, + 268.3326f, 212.8878f, 405.8205f, 250.8319f, 238.7950f, -53.0971f, 286.2983f, 84.0919f}; std::vector scores_vec = { - 0.9822f, 0.9644f, 0.1426f, 0.7149f, 0.6008f, 0.6906f, 0.0962f, 0.1886f, 0.0766f, 0.6041f, 0.9866f, 0.6720f, - 0.7108f, 0.9846f, 0.6780f, 0.0402f, 0.8670f, 0.3647f, 0.0044f, 0.5072f, 0.9370f, 0.2573f, 0.4915f, 0.1738f, - 0.0577f, 0.0805f, 0.7270f, 0.8641f, 0.1433f, 0.2883f, 0.1950f, 0.0269f, 0.5534f, 0.6999f, 0.6479f, 0.3881f, - 0.5550f, 0.0941f, 0.1543f, 0.9318f, 0.7615f, 0.9227f, 0.9167f, 0.6494f, 0.9282f, 0.4167f, 0.0036f, 0.0626f, - 0.1095f, 0.0954f, 0.3517f, 0.7013f, 0.7906f, 0.5902f, 0.1464f, 0.7479f, 0.3548f, 0.0130f, 0.2806f, 0.3306f, - 0.2742f, 0.8119f, 0.7599f, 0.6956f, 0.1390f, 0.8078f, 0.6772f, 0.1948f, 0.6481f, 0.4835f, 0.4394f, 0.1121f, - 0.5183f, 0.0999f, 0.1643f, 0.1325f, 0.9541f, 0.2849f, 0.3552f, 0.3221f, 0.8983f, 0.5630f, 0.9192f, 0.2999f, - 0.1148f, 0.5562f, 0.3455f, 0.8019f, 0.8794f, 0.4726f, 0.9714f, 0.5530f, 0.2709f, 0.4890f, 0.0373f, 0.8040f, - 0.1014f, 0.3087f, 0.5653f, 0.0430f, 0.0793f, 0.6961f, 0.0718f, 0.4771f, 0.3387f, 0.2281f, 0.1888f, 0.7634f, - 0.9515f, 0.1402f, 0.9597f, 0.5948f, 0.6417f, 0.7099f, 0.7041f, 0.8198f, 0.4835f, 0.5334f, 0.3238f, 0.1053f, - 0.6646f, 0.0336f, 0.2756f, 0.0942f, 0.1907f, 0.6387f, 0.6285f, 0.4211f, 0.0902f, 0.4334f, 0.3527f, 0.7205f, - 0.5790f, 
0.4916f, 0.4870f, 0.9663f, 0.7563f, 0.4970f, 0.4792f, 0.0265f, 0.9425f, 0.3192f, 0.2559f, 0.9994f, - 0.7187f, 0.0474f, 0.0619f, 0.0255f, 0.5996f, 0.0716f, 0.9334f, 0.9369f, 0.5461f, 0.6166f, 0.2919f, 0.0640f, - 0.7375f, 0.1018f, 0.0856f, 0.3112f, 0.0125f, 0.4340f, 0.7077f, 0.8013f, 0.6043f, 0.8469f, 0.4065f, 0.8488f, - 0.5065f, 0.2230f, 0.9441f, 0.2750f, 0.0262f, 0.2427f, 0.3667f, 0.3513f, 0.5247f, 0.8831f, 0.2923f, 0.5208f, - 0.3401f, 0.8218f, 0.1576f, 0.1035f, 0.5030f, 0.6719f, 0.7955f, 0.5896f, 0.7738f, 0.3927f, 0.0329f, 0.1161f, - 0.0387f, 0.3289f, 0.4955f, 0.3563f, 0.5606f, 0.4806f, 0.6779f, 0.6670f, 0.3181f, 0.3462f, 0.5851f, 0.5964f, - 0.3147f, 0.3303f, 0.6940f, 0.6474f, 0.1351f, 0.4410f, 0.8927f, 0.0363f, 0.8552f, 0.1632f, 0.5072f, 0.4243f, - 0.0101f, 0.9154f, 0.4549f, 0.9543f, 0.2867f, 0.8663f, 0.9224f, 0.5568f, 0.2027f, 0.6852f, 0.5490f, 0.9445f, - 0.4393f, 0.2685f, 0.1383f, 0.6986f, 0.9741f, 0.0283f, 0.7404f, 0.9269f, 0.0748f, 0.1102f, 0.6920f, 0.6480f, - 0.0688f, 0.8344f, 0.5234f, 0.9072f, 0.8780f, 0.8125f, 0.5159f, 0.2517f, 0.5060f, 0.1008f, 0.6588f, 0.1340f, - 0.5112f, 0.0544f, 0.2995f, 0.2321f, 0.6200f, 0.7868f, 0.0573f, 0.8503f, 0.8608f, 0.3423f, 0.6590f, 0.4026f, - 0.1542f, 0.5287f, 0.0864f, 0.8785f, 0.9243f, 0.8216f, 0.5625f, 0.5576f, 0.9846f, 0.2479f, 0.0759f, 0.5619f, - 0.3288f, 0.3223f, 0.0071f, 0.5962f, 0.2640f, 0.1879f, 0.0404f, 0.3644f, 0.8790f, 0.3367f, 0.6791f, 0.7565f, - 0.3281f, 0.8216f, 0.6919f, 0.5592f, 0.0010f, 0.0351f, 0.9909f, 0.7823f, 0.9376f, 0.9023f, 0.0204f, 0.7918f, - 0.4511f, 0.7896f, 0.0067f, 0.2882f, 0.7513f, 0.7930f, 0.6197f, 0.3013f, 0.3104f, 0.9668f, 0.4392f, 0.4471f, - 0.5523f, 0.4095f, 0.5527f, 0.4323f, 0.8267f, 0.9091f, 0.9321f, 0.5643f, 0.4421f, 0.7052f, 0.8383f, 0.5630f, - 0.7000f, 0.7497f, 0.6764f, 0.7461f, 0.2086f, 0.4984f, 0.5883f, 0.0025f, 0.8560f, 0.6100f, 0.1291f, 0.8164f, - 0.7171f, 0.7583f, 0.3920f, 0.8542f, 0.4140f, 0.5705f, 0.0006f, 0.6449f, 0.7182f, 0.5671f, 0.4966f, 0.8099f, - 0.6814f, 0.2781f, 0.9591f, 
0.7073f, 0.9879f, 0.9713f, 0.9189f, 0.7554f, 0.6094f, 0.1722f, 0.5434f, 0.7654f, - 0.5209f, 0.8682f, 0.1097f, 0.3809f, 0.5060f, 0.4323f, 0.1086f, 0.1535f, 0.8376f, 0.4844f, 0.0487f, 0.0165f, - 0.4735f, 0.1644f, 0.7051f, 0.7953f, 0.2283f, 0.5922f, 0.1544f, 0.3036f, 0.8888f, 0.5441f, 0.8859f, 0.2252f, - 0.3300f, 0.4710f, 0.4801f, 0.9976f, 0.1144f, 0.8520f, 0.8637f, 0.5532f, 0.3440f, 0.5192f, 0.2925f, 0.7991f, - 0.4983f, 0.9258f, 0.6227f, 0.5143f, 0.7111f, 0.5039f, 0.9045f, 0.1844f, 0.9733f, 0.8122f, 0.8607f, 0.4829f, - 0.8372f, 0.3068f, 0.7619f, 0.1405f, 0.3071f, 0.4457f, 0.3223f, 0.3870f, 0.8201f, 0.2567f, 0.7453f, 0.0737f, - 0.7657f, 0.7920f, 0.4017f, 0.7225f, 0.9151f, 0.8007f, 0.3904f, 0.4842f, 0.7794f, 0.2926f, 0.8039f, 0.3281f, - 0.8060f, 0.0868f, 0.0444f, 0.9977f, 0.8695f, 0.8828f, 0.9513f, 0.4383f, 0.2868f, 0.1300f, 0.5012f, 0.2200f, - 0.9356f, 0.0040f, 0.1432f, 0.2465f, 0.1990f, 0.2258f, 0.6560f, 0.3275f, 0.6150f, 0.8903f, 0.6026f, 0.6945f, - 0.3655f, 0.1597f, 0.3206f, 0.9643f, 0.6218f, 0.2775f, 0.4509f, 0.8355f, 0.6684f, 0.5607f, 0.8852f, 0.6724f, - 0.6427f, 0.1898f, 0.1064f, 0.9651f, 0.5989f, 0.4157f, 0.5890f, 0.0618f, 0.8221f, 0.2166f, 0.8045f, 0.5344f, - 0.2766f, 0.0302f, 0.8158f, 0.1765f, 0.0518f, 0.7559f, 0.3500f, 0.3893f, 0.2471f, 0.8592f, 0.2973f, 0.2102f, - 0.3092f, 0.2031f, 0.3177f, 0.0829f, 0.1585f, 0.4171f, 0.8795f, 0.0573f, 0.2127f, 0.9083f, 0.8900f, 0.6795f, - 0.2405f, 0.4198f, 0.2112f, 0.1286f, 0.3800f, 0.5758f, 0.3599f, 0.6108f, 0.2963f, 0.3459f, 0.7907f, 0.8783f, - 0.3220f, 0.5715f, 0.2782f, 0.0533f, 0.7379f, 0.1710f, 0.4257f, 0.4870f, 0.1845f, 0.0946f, 0.3480f, 0.9523f, - 0.6151f, 0.3814f, 0.0389f, 0.6003f, 0.0923f, 0.5425f, 0.7520f, 0.4236f, 0.2994f, 0.0474f, 0.0248f, 0.4300f, - 0.8833f, 0.2441f, 0.5741f, 0.6843f, 0.0608f, 0.1531f, 0.3313f, 0.6701f, 0.4390f, 0.7342f, 0.8676f, 0.7584f, - 0.9922f, 0.7544f, 0.8522f, 0.8324f, 0.7303f, 0.8018f, 0.9347f, 0.4752f, 0.6383f, 0.5149f, 0.8510f, 0.4314f, - 0.8197f, 0.7994f, 0.9619f, 0.2489f, 0.7096f, 
0.7569f, 0.9363f, 0.9069f, 0.5735f, 0.5792f, 0.1673f, 0.9750f, - 0.2550f, 0.7247f, 0.7958f, 0.4412f, 0.2112f, 0.1890f, 0.8565f, 0.5108f, 0.0901f, 0.7170f, 0.2502f, 0.8764f, - 0.3096f, 0.2003f, 0.0849f, 0.5115f, 0.4507f, 0.7513f, 0.4646f, 0.3438f, 0.2617f, 0.2781f, 0.9278f, 0.1651f, - 0.9882f, 0.3269f, 0.0884f, 0.2487f, 0.0584f, 0.7900f, 0.5126f, 0.3370f, 0.6620f, 0.6306f, 0.9399f, 0.9613f, - 0.6807f, 0.8178f, 0.7924f, 0.4913f, 0.7045f, 0.0783f, 0.7580f, 0.9618f, 0.0850f, 0.8361f, 0.9330f, 0.2262f, - 0.5248f, 0.9279f, 0.9602f, 0.1279f, 0.3490f, 0.6981f, 0.2216f, 0.3248f, 0.0233f, 0.1535f, 0.5623f, 0.6531f, - 0.6489f, 0.7784f, 0.4153f, 0.2735f, 0.0156f, 0.2066f, 0.3124f, 0.1782f, 0.0201f, 0.1574f, 0.6661f, 0.6296f, - 0.9357f, 0.7982f, 0.5678f, 0.1376f, 0.5641f, 0.0616f, 0.4309f, 0.3903f, 0.4278f, 0.2798f, 0.6858f, 0.8409f, - 0.7685f, 0.6278f, 0.5383f, 0.0311f, 0.7229f, 0.5450f, 0.2707f, 0.3278f, 0.9356f, 0.6244f, 0.4759f, 0.6209f, - 0.4137f, 0.4702f, 0.2903f, 0.4399f, 0.6856f, 0.0399f, 0.7950f, 0.2830f, 0.6826f, 0.6427f, 0.6526f, 0.6081f, - 0.9591f, 0.5083f, 0.7323f, 0.7054f, 0.2363f, 0.2833f, 0.4240f, 0.2777f, 0.3667f, 0.3910f, 0.6039f, 0.2199f, - 0.8043f, 0.4375f, 0.7062f, 0.0814f, 0.4700f, 0.0282f, 0.6759f, 0.3437f, 0.9493f, 0.3241f, 0.5638f, 0.2574f, - 0.6201f, 0.4670f, 0.3706f, 0.2037f, 0.1115f, 0.1199f, 0.9990f, 0.4123f, 0.0019f, 0.9529f, 0.0200f, 0.4186f, - 0.7175f, 0.9146f, 0.7129f, 0.4636f, 0.9744f, 0.0393f, 0.9869f, 0.8494f, 0.9289f, 0.2548f, 0.1425f, 0.6633f, - 0.5159f, 0.5232f, 0.9246f, 0.6201f, 0.3111f, 0.4001f, 0.1335f, 0.1923f, 0.1434f, 0.8103f, 0.7049f, 0.5303f, - 0.3744f, 0.6685f, 0.8129f, 0.8812f, 0.5470f, 0.8199f, 0.5113f, 0.4745f, 0.8654f, 0.3864f, 0.3959f, 0.3049f, - 0.5187f, 0.5449f, 0.6605f, 0.4305f, 0.2178f, 0.8668f, 0.3460f, 0.9229f, 0.2074f, 0.5601f, 0.5366f, 0.8286f, - 0.1389f, 0.9099f, 0.5314f, 0.5861f, 0.5102f, 0.0360f, 0.4971f, 0.2635f, 0.3427f, 0.6491f, 0.4977f, 0.0932f, - 0.0730f, 0.1857f, 0.1909f, 0.6083f, 0.1778f, 0.8817f, 0.2098f, 
0.0911f, 0.8757f, 0.2953f, 0.4254f, 0.9590f, - 0.9444f, 0.7149f, 0.0689f, 0.5933f, 0.9891f, 0.9469f, 0.1060f, 0.3960f}; + 0.9822f, 0.9644f, 0.1426f, 0.7149f, 0.6008f, 0.6906f, 0.0962f, 0.1886f, 0.0766f, 0.6041f, + 0.9866f, 0.6720f, 0.7108f, 0.9846f, 0.6780f, 0.0402f, 0.8670f, 0.3647f, 0.0044f, 0.5072f, + 0.9370f, 0.2573f, 0.4915f, 0.1738f, 0.0577f, 0.0805f, 0.7270f, 0.8641f, 0.1433f, 0.2883f, + 0.1950f, 0.0269f, 0.5534f, 0.6999f, 0.6479f, 0.3881f, 0.5550f, 0.0941f, 0.1543f, 0.9318f, + 0.7615f, 0.9227f, 0.9167f, 0.6494f, 0.9282f, 0.4167f, 0.0036f, 0.0626f, 0.1095f, 0.0954f, + 0.3517f, 0.7013f, 0.7906f, 0.5902f, 0.1464f, 0.7479f, 0.3548f, 0.0130f, 0.2806f, 0.3306f, + 0.2742f, 0.8119f, 0.7599f, 0.6956f, 0.1390f, 0.8078f, 0.6772f, 0.1948f, 0.6481f, 0.4835f, + 0.4394f, 0.1121f, 0.5183f, 0.0999f, 0.1643f, 0.1325f, 0.9541f, 0.2849f, 0.3552f, 0.3221f, + 0.8983f, 0.5630f, 0.9192f, 0.2999f, 0.1148f, 0.5562f, 0.3455f, 0.8019f, 0.8794f, 0.4726f, + 0.9714f, 0.5530f, 0.2709f, 0.4890f, 0.0373f, 0.8040f, 0.1014f, 0.3087f, 0.5653f, 0.0430f, + 0.0793f, 0.6961f, 0.0718f, 0.4771f, 0.3387f, 0.2281f, 0.1888f, 0.7634f, 0.9515f, 0.1402f, + 0.9597f, 0.5948f, 0.6417f, 0.7099f, 0.7041f, 0.8198f, 0.4835f, 0.5334f, 0.3238f, 0.1053f, + 0.6646f, 0.0336f, 0.2756f, 0.0942f, 0.1907f, 0.6387f, 0.6285f, 0.4211f, 0.0902f, 0.4334f, + 0.3527f, 0.7205f, 0.5790f, 0.4916f, 0.4870f, 0.9663f, 0.7563f, 0.4970f, 0.4792f, 0.0265f, + 0.9425f, 0.3192f, 0.2559f, 0.9994f, 0.7187f, 0.0474f, 0.0619f, 0.0255f, 0.5996f, 0.0716f, + 0.9334f, 0.9369f, 0.5461f, 0.6166f, 0.2919f, 0.0640f, 0.7375f, 0.1018f, 0.0856f, 0.3112f, + 0.0125f, 0.4340f, 0.7077f, 0.8013f, 0.6043f, 0.8469f, 0.4065f, 0.8488f, 0.5065f, 0.2230f, + 0.9441f, 0.2750f, 0.0262f, 0.2427f, 0.3667f, 0.3513f, 0.5247f, 0.8831f, 0.2923f, 0.5208f, + 0.3401f, 0.8218f, 0.1576f, 0.1035f, 0.5030f, 0.6719f, 0.7955f, 0.5896f, 0.7738f, 0.3927f, + 0.0329f, 0.1161f, 0.0387f, 0.3289f, 0.4955f, 0.3563f, 0.5606f, 0.4806f, 0.6779f, 0.6670f, + 0.3181f, 0.3462f, 0.5851f, 0.5964f, 
0.3147f, 0.3303f, 0.6940f, 0.6474f, 0.1351f, 0.4410f, + 0.8927f, 0.0363f, 0.8552f, 0.1632f, 0.5072f, 0.4243f, 0.0101f, 0.9154f, 0.4549f, 0.9543f, + 0.2867f, 0.8663f, 0.9224f, 0.5568f, 0.2027f, 0.6852f, 0.5490f, 0.9445f, 0.4393f, 0.2685f, + 0.1383f, 0.6986f, 0.9741f, 0.0283f, 0.7404f, 0.9269f, 0.0748f, 0.1102f, 0.6920f, 0.6480f, + 0.0688f, 0.8344f, 0.5234f, 0.9072f, 0.8780f, 0.8125f, 0.5159f, 0.2517f, 0.5060f, 0.1008f, + 0.6588f, 0.1340f, 0.5112f, 0.0544f, 0.2995f, 0.2321f, 0.6200f, 0.7868f, 0.0573f, 0.8503f, + 0.8608f, 0.3423f, 0.6590f, 0.4026f, 0.1542f, 0.5287f, 0.0864f, 0.8785f, 0.9243f, 0.8216f, + 0.5625f, 0.5576f, 0.9846f, 0.2479f, 0.0759f, 0.5619f, 0.3288f, 0.3223f, 0.0071f, 0.5962f, + 0.2640f, 0.1879f, 0.0404f, 0.3644f, 0.8790f, 0.3367f, 0.6791f, 0.7565f, 0.3281f, 0.8216f, + 0.6919f, 0.5592f, 0.0010f, 0.0351f, 0.9909f, 0.7823f, 0.9376f, 0.9023f, 0.0204f, 0.7918f, + 0.4511f, 0.7896f, 0.0067f, 0.2882f, 0.7513f, 0.7930f, 0.6197f, 0.3013f, 0.3104f, 0.9668f, + 0.4392f, 0.4471f, 0.5523f, 0.4095f, 0.5527f, 0.4323f, 0.8267f, 0.9091f, 0.9321f, 0.5643f, + 0.4421f, 0.7052f, 0.8383f, 0.5630f, 0.7000f, 0.7497f, 0.6764f, 0.7461f, 0.2086f, 0.4984f, + 0.5883f, 0.0025f, 0.8560f, 0.6100f, 0.1291f, 0.8164f, 0.7171f, 0.7583f, 0.3920f, 0.8542f, + 0.4140f, 0.5705f, 0.0006f, 0.6449f, 0.7182f, 0.5671f, 0.4966f, 0.8099f, 0.6814f, 0.2781f, + 0.9591f, 0.7073f, 0.9879f, 0.9713f, 0.9189f, 0.7554f, 0.6094f, 0.1722f, 0.5434f, 0.7654f, + 0.5209f, 0.8682f, 0.1097f, 0.3809f, 0.5060f, 0.4323f, 0.1086f, 0.1535f, 0.8376f, 0.4844f, + 0.0487f, 0.0165f, 0.4735f, 0.1644f, 0.7051f, 0.7953f, 0.2283f, 0.5922f, 0.1544f, 0.3036f, + 0.8888f, 0.5441f, 0.8859f, 0.2252f, 0.3300f, 0.4710f, 0.4801f, 0.9976f, 0.1144f, 0.8520f, + 0.8637f, 0.5532f, 0.3440f, 0.5192f, 0.2925f, 0.7991f, 0.4983f, 0.9258f, 0.6227f, 0.5143f, + 0.7111f, 0.5039f, 0.9045f, 0.1844f, 0.9733f, 0.8122f, 0.8607f, 0.4829f, 0.8372f, 0.3068f, + 0.7619f, 0.1405f, 0.3071f, 0.4457f, 0.3223f, 0.3870f, 0.8201f, 0.2567f, 0.7453f, 0.0737f, + 0.7657f, 
0.7920f, 0.4017f, 0.7225f, 0.9151f, 0.8007f, 0.3904f, 0.4842f, 0.7794f, 0.2926f, + 0.8039f, 0.3281f, 0.8060f, 0.0868f, 0.0444f, 0.9977f, 0.8695f, 0.8828f, 0.9513f, 0.4383f, + 0.2868f, 0.1300f, 0.5012f, 0.2200f, 0.9356f, 0.0040f, 0.1432f, 0.2465f, 0.1990f, 0.2258f, + 0.6560f, 0.3275f, 0.6150f, 0.8903f, 0.6026f, 0.6945f, 0.3655f, 0.1597f, 0.3206f, 0.9643f, + 0.6218f, 0.2775f, 0.4509f, 0.8355f, 0.6684f, 0.5607f, 0.8852f, 0.6724f, 0.6427f, 0.1898f, + 0.1064f, 0.9651f, 0.5989f, 0.4157f, 0.5890f, 0.0618f, 0.8221f, 0.2166f, 0.8045f, 0.5344f, + 0.2766f, 0.0302f, 0.8158f, 0.1765f, 0.0518f, 0.7559f, 0.3500f, 0.3893f, 0.2471f, 0.8592f, + 0.2973f, 0.2102f, 0.3092f, 0.2031f, 0.3177f, 0.0829f, 0.1585f, 0.4171f, 0.8795f, 0.0573f, + 0.2127f, 0.9083f, 0.8900f, 0.6795f, 0.2405f, 0.4198f, 0.2112f, 0.1286f, 0.3800f, 0.5758f, + 0.3599f, 0.6108f, 0.2963f, 0.3459f, 0.7907f, 0.8783f, 0.3220f, 0.5715f, 0.2782f, 0.0533f, + 0.7379f, 0.1710f, 0.4257f, 0.4870f, 0.1845f, 0.0946f, 0.3480f, 0.9523f, 0.6151f, 0.3814f, + 0.0389f, 0.6003f, 0.0923f, 0.5425f, 0.7520f, 0.4236f, 0.2994f, 0.0474f, 0.0248f, 0.4300f, + 0.8833f, 0.2441f, 0.5741f, 0.6843f, 0.0608f, 0.1531f, 0.3313f, 0.6701f, 0.4390f, 0.7342f, + 0.8676f, 0.7584f, 0.9922f, 0.7544f, 0.8522f, 0.8324f, 0.7303f, 0.8018f, 0.9347f, 0.4752f, + 0.6383f, 0.5149f, 0.8510f, 0.4314f, 0.8197f, 0.7994f, 0.9619f, 0.2489f, 0.7096f, 0.7569f, + 0.9363f, 0.9069f, 0.5735f, 0.5792f, 0.1673f, 0.9750f, 0.2550f, 0.7247f, 0.7958f, 0.4412f, + 0.2112f, 0.1890f, 0.8565f, 0.5108f, 0.0901f, 0.7170f, 0.2502f, 0.8764f, 0.3096f, 0.2003f, + 0.0849f, 0.5115f, 0.4507f, 0.7513f, 0.4646f, 0.3438f, 0.2617f, 0.2781f, 0.9278f, 0.1651f, + 0.9882f, 0.3269f, 0.0884f, 0.2487f, 0.0584f, 0.7900f, 0.5126f, 0.3370f, 0.6620f, 0.6306f, + 0.9399f, 0.9613f, 0.6807f, 0.8178f, 0.7924f, 0.4913f, 0.7045f, 0.0783f, 0.7580f, 0.9618f, + 0.0850f, 0.8361f, 0.9330f, 0.2262f, 0.5248f, 0.9279f, 0.9602f, 0.1279f, 0.3490f, 0.6981f, + 0.2216f, 0.3248f, 0.0233f, 0.1535f, 0.5623f, 0.6531f, 0.6489f, 0.7784f, 
0.4153f, 0.2735f, + 0.0156f, 0.2066f, 0.3124f, 0.1782f, 0.0201f, 0.1574f, 0.6661f, 0.6296f, 0.9357f, 0.7982f, + 0.5678f, 0.1376f, 0.5641f, 0.0616f, 0.4309f, 0.3903f, 0.4278f, 0.2798f, 0.6858f, 0.8409f, + 0.7685f, 0.6278f, 0.5383f, 0.0311f, 0.7229f, 0.5450f, 0.2707f, 0.3278f, 0.9356f, 0.6244f, + 0.4759f, 0.6209f, 0.4137f, 0.4702f, 0.2903f, 0.4399f, 0.6856f, 0.0399f, 0.7950f, 0.2830f, + 0.6826f, 0.6427f, 0.6526f, 0.6081f, 0.9591f, 0.5083f, 0.7323f, 0.7054f, 0.2363f, 0.2833f, + 0.4240f, 0.2777f, 0.3667f, 0.3910f, 0.6039f, 0.2199f, 0.8043f, 0.4375f, 0.7062f, 0.0814f, + 0.4700f, 0.0282f, 0.6759f, 0.3437f, 0.9493f, 0.3241f, 0.5638f, 0.2574f, 0.6201f, 0.4670f, + 0.3706f, 0.2037f, 0.1115f, 0.1199f, 0.9990f, 0.4123f, 0.0019f, 0.9529f, 0.0200f, 0.4186f, + 0.7175f, 0.9146f, 0.7129f, 0.4636f, 0.9744f, 0.0393f, 0.9869f, 0.8494f, 0.9289f, 0.2548f, + 0.1425f, 0.6633f, 0.5159f, 0.5232f, 0.9246f, 0.6201f, 0.3111f, 0.4001f, 0.1335f, 0.1923f, + 0.1434f, 0.8103f, 0.7049f, 0.5303f, 0.3744f, 0.6685f, 0.8129f, 0.8812f, 0.5470f, 0.8199f, + 0.5113f, 0.4745f, 0.8654f, 0.3864f, 0.3959f, 0.3049f, 0.5187f, 0.5449f, 0.6605f, 0.4305f, + 0.2178f, 0.8668f, 0.3460f, 0.9229f, 0.2074f, 0.5601f, 0.5366f, 0.8286f, 0.1389f, 0.9099f, + 0.5314f, 0.5861f, 0.5102f, 0.0360f, 0.4971f, 0.2635f, 0.3427f, 0.6491f, 0.4977f, 0.0932f, + 0.0730f, 0.1857f, 0.1909f, 0.6083f, 0.1778f, 0.8817f, 0.2098f, 0.0911f, 0.8757f, 0.2953f, + 0.4254f, 0.9590f, 0.9444f, 0.7149f, 0.0689f, 0.5933f, 0.9891f, 0.9469f, 0.1060f, 0.3960f}; migraphx::parameter_map host_params; host_params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); @@ -876,7 +1010,20 @@ TEST_CASE(nms_200boxes_2batch_2class_test) auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); indices.resize(static_cast(num_selected) * 3); - std::vector gold = {0, 0, 143, 0, 0, 10, 0, 0, 13, 0, 0, 0, 0, 0, 90, 0, 0, 135, 0, 0, 1, 0, 0, 76, 0, 0, 108, 0, 0, 170, 0, 0, 140, 0, 0, 20, 0, 0, 151, 0, 0, 150, 0, 0, 39, 0, 0, 44, 0, 0, 41, 0, 0, 82, 0, 0, 80, 
0, 0, 88, 0, 0, 16, 0, 0, 27, 0, 0, 167, 0, 0, 165, 0, 0, 181, 0, 1, 187, 0, 1, 94, 0, 1, 152, 0, 1, 72, 0, 1, 32, 0, 1, 153, 0, 1, 109, 0, 1, 150, 0, 1, 19, 0, 1, 27, 0, 1, 96, 0, 1, 35, 0, 1, 197, 0, 1, 68, 0, 1, 22, 0, 1, 154, 0, 1, 17, 0, 1, 117, 0, 1, 43, 0, 1, 97, 0, 1, 10, 0, 1, 180, 0, 1, 182, 0, 1, 67, 0, 1, 44, 1, 0, 35, 1, 0, 152, 1, 0, 175, 1, 0, 4, 1, 0, 71, 1, 0, 166, 1, 0, 127, 1, 0, 38, 1, 0, 170, 1, 0, 44, 1, 0, 158, 1, 0, 198, 1, 0, 24, 1, 0, 101, 1, 0, 171, 1, 0, 2, 1, 0, 53, 1, 0, 102, 1, 0, 66, 1, 0, 140, 1, 0, 37, 1, 0, 98, 1, 0, 115, 1, 0, 150, 1, 0, 6, 1, 1, 114, 1, 1, 196, 1, 1, 0, 1, 1, 126, 1, 1, 124, 1, 1, 19, 1, 1, 11, 1, 1, 26, 1, 1, 84, 1, 1, 191, 1, 1, 117, 1, 1, 104, 1, 1, 197, 1, 1, 192, 1, 1, 10, 1, 1, 48, 1, 1, 68, 1, 1, 22, 1, 1, 128, 1, 1, 25, 1, 1, 134, 1, 1, 163, 1, 1, 121, 1, 1, 169, 1, 1, 185}; + std::vector gold = { + 0, 0, 143, 0, 0, 10, 0, 0, 13, 0, 0, 0, 0, 0, 90, 0, 0, 135, 0, 0, 1, 0, 0, 76, + 0, 0, 108, 0, 0, 170, 0, 0, 140, 0, 0, 20, 0, 0, 151, 0, 0, 150, 0, 0, 39, 0, 0, 44, + 0, 0, 41, 0, 0, 82, 0, 0, 80, 0, 0, 88, 0, 0, 16, 0, 0, 27, 0, 0, 167, 0, 0, 165, + 0, 0, 181, 0, 1, 187, 0, 1, 94, 0, 1, 152, 0, 1, 72, 0, 1, 32, 0, 1, 153, 0, 1, 109, + 0, 1, 150, 0, 1, 19, 0, 1, 27, 0, 1, 96, 0, 1, 35, 0, 1, 197, 0, 1, 68, 0, 1, 22, + 0, 1, 154, 0, 1, 17, 0, 1, 117, 0, 1, 43, 0, 1, 97, 0, 1, 10, 0, 1, 180, 0, 1, 182, + 0, 1, 67, 0, 1, 44, 1, 0, 35, 1, 0, 152, 1, 0, 175, 1, 0, 4, 1, 0, 71, 1, 0, 166, + 1, 0, 127, 1, 0, 38, 1, 0, 170, 1, 0, 44, 1, 0, 158, 1, 0, 198, 1, 0, 24, 1, 0, 101, + 1, 0, 171, 1, 0, 2, 1, 0, 53, 1, 0, 102, 1, 0, 66, 1, 0, 140, 1, 0, 37, 1, 0, 98, + 1, 0, 115, 1, 0, 150, 1, 0, 6, 1, 1, 114, 1, 1, 196, 1, 1, 0, 1, 1, 126, 1, 1, 124, + 1, 1, 19, 1, 1, 11, 1, 1, 26, 1, 1, 84, 1, 1, 191, 1, 1, 117, 1, 1, 104, 1, 1, 197, + 1, 1, 192, 1, 1, 10, 1, 1, 48, 1, 1, 68, 1, 1, 22, 1, 1, 128, 1, 1, 25, 1, 1, 134, + 1, 1, 163, 1, 1, 121, 1, 1, 169, 1, 1, 185}; EXPECT(migraphx::verify::verify_rms_range(indices, gold)); 
EXPECT(num_selected == 100); } From 49e3a2a2fc3db6163ddeb6d7e791f7fe5025778a Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 14:01:36 -0500 Subject: [PATCH 23/32] Update NMS op to do fixed_shape_error_check only on fixed shapes --- src/include/migraphx/op/nonmaxsuppression.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index b71bc4822eb..96ad442cd8a 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -93,7 +93,10 @@ struct nonmaxsuppression } }; - fixed_shape_error_check(); + if(not (inputs.at(0).dynamic() or inputs.at(1).dynamic())) + { + fixed_shape_error_check(); + } std::vector out_lens = {max_num_boxes, 3}; shape s_ind{shape::int64_type, out_lens}; shape s_num_selected{shape::int64_type, {1}}; From 94c374438dd0157cfe8c8132002d36327a7427b8 Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 14:44:32 -0500 Subject: [PATCH 24/32] Update tests and fixes --- docs/dev/onnx_operators.rst | 18 +-- src/include/migraphx/onnx.hpp | 2 - .../include/migraphx/onnx/onnx_parser.hpp | 1 - src/onnx/onnx.cpp | 1 - src/onnx/parse_nonmaxsuppression.cpp | 11 +- src/targets/gpu/lowering.cpp | 3 +- test/multi_target/multitarget_test.cpp | 9 +- test/onnx/gen_onnx.py | 28 +--- test/onnx/nms_dynamic_batch_test.onnx | Bin 411 -> 388 bytes test/onnx/nms_use_dyn_output_false_test.onnx | Bin 404 -> 0 bytes test/onnx/parse/nms_dynamic_batch_test.cpp | 7 +- test/onnx/parse/nms_dynamic_boxes_test.cpp | 6 +- test/onnx/parse/nms_dynamic_classes_test.cpp | 6 +- test/onnx/parse/nms_test.cpp | 3 +- .../parse/nms_use_dyn_output_false_test.cpp | 55 ------- test/op_shape_test.cpp | 140 ++++-------------- test/ref/nonmaxsuppression.cpp | 126 ++++++++-------- 17 files changed, 128 insertions(+), 288 deletions(-) delete mode 100644 test/onnx/nms_use_dyn_output_false_test.onnx delete mode 100644 
test/onnx/parse/nms_use_dyn_output_false_test.cpp diff --git a/docs/dev/onnx_operators.rst b/docs/dev/onnx_operators.rst index a24fbac6a80..e3bd403db67 100644 --- a/docs/dev/onnx_operators.rst +++ b/docs/dev/onnx_operators.rst @@ -511,15 +511,15 @@ Operator Support Matrix +--------------------------+-----------+-----------------+------------------------------+ | NegativeLogLikelihoodLoss| ❌ | | | +--------------------------+-----------+-----------------+------------------------------+ -| NonMaxSuppression | ✅ | FP8, FP16, | fixed output | -| | | FP32, FP64 | size unless | -| | | | ``use_dyn_output`` | -| | | | set | -+--------------------------+-----------+-----------------+------------------------------+ -| NonZero | ✅ | FP8, FP16, | fixed output | -| | | FP32, FP64 | size unless | -| | | | ``use_dyn_output`` | -| | | | set | +| NonMaxSuppression | ✅ | FP8, FP16, | | +| | | FP32, FP64 | | +| | | | | +| | | | | ++--------------------------+-----------+-----------------+------------------------------+ +| NonZero | ✅ | FP8, FP16, | fixed output size | +| | | FP32, FP64 | | +| | | | | +| | | | | +--------------------------+-----------+-----------------+------------------------------+ | Not | ✅ | BOOL | | +--------------------------+-----------+-----------------+------------------------------+ diff --git a/src/include/migraphx/onnx.hpp b/src/include/migraphx/onnx.hpp index 13745994fd7..6715022a1ec 100644 --- a/src/include/migraphx/onnx.hpp +++ b/src/include/migraphx/onnx.hpp @@ -56,8 +56,6 @@ struct onnx_options /// Since loop will become a tensor of max iter size a huge number can cause overflow during /// shape computations. 
int64_t limit_max_iterations = std::numeric_limits::max(); - /// Use dynamic output for operators when available - bool use_dyn_output = false; /// Parse in ONNX node names as debug symbols bool use_debug_symbols = false; /// Path to use for the external data if it is stored at different location compared to onnx diff --git a/src/onnx/include/migraphx/onnx/onnx_parser.hpp b/src/onnx/include/migraphx/onnx/onnx_parser.hpp index 4f58cd085a9..0d83a094759 100644 --- a/src/onnx/include/migraphx/onnx/onnx_parser.hpp +++ b/src/onnx/include/migraphx/onnx/onnx_parser.hpp @@ -102,7 +102,6 @@ struct onnx_parser std::unordered_map> map_input_dims; std::unordered_map dim_params; std::unordered_map> map_dyn_input_dims; - bool use_dyn_output = false; bool skip_unknown_operators = false; bool use_debug_symbols = false; int64_t max_loop_iterations = 10; diff --git a/src/onnx/onnx.cpp b/src/onnx/onnx.cpp index a14b4b3c581..cab3fc5daa1 100644 --- a/src/onnx/onnx.cpp +++ b/src/onnx/onnx.cpp @@ -72,7 +72,6 @@ static program parse_onnx_from(const onnx_options& options, Ts&&... 
xs) parser.skip_unknown_operators = options.skip_unknown_operators; parser.max_loop_iterations = options.max_loop_iterations; parser.limit_max_iterations = options.limit_max_iterations; - parser.use_dyn_output = options.use_dyn_output; if(options.print_program_on_error) { diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index 74427a9d5b8..959683d01e9 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -25,7 +25,7 @@ #include #include -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS) namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { @@ -46,16 +46,17 @@ struct parse_nonmaxsuppression : op_parser auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{})) { - return indices; - } - else - { + //TODO: planning to make this the default behavior and removing the env var. auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); auto slice_ins = info.add_instruction( make_op("slice", {{"axes", {0}}, {"starts", {0}}}), indices, num_selected); return slice_ins; } + else + { + return indices; + } } }; diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index a7baf80a755..092fe42892f 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -59,6 +59,7 @@ namespace gpu { MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_SET_GEMM_PROVIDER) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MIOPEN_POOLING) +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS) struct miopen_apply { @@ -484,7 +485,7 @@ struct miopen_apply // replace_allocate pass can later turn it into hip::allocate. 
shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; auto mask_alloc = insert_allocation(ins, mask_shape); - + auto sorted = mod->insert_instruction( ins, make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), diff --git a/test/multi_target/multitarget_test.cpp b/test/multi_target/multitarget_test.cpp index a52d4940ce6..1ca5758e74a 100644 --- a/test/multi_target/multitarget_test.cpp +++ b/test/multi_target/multitarget_test.cpp @@ -216,14 +216,17 @@ TEST_CASE(single_target_multi_compile) auto max_out_l = gpu_mod->add_literal(int64_t{4}); auto iou_threshold = gpu_mod->add_literal(0.5f); auto score_threshold = gpu_mod->add_literal(0.0f); - auto r = gpu_mod->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + auto nms = gpu_mod->add_instruction( + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_param_gpu, scores_l, max_out_l, iou_threshold, score_threshold); + auto idx = gpu_mod->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); + auto cnt = gpu_mod->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms); + auto r = gpu_mod->add_instruction( + migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), idx, cnt); gpu_mod->add_return({r}); auto run_on_gpu = mm->add_instruction( diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py index b359b64de76..0423f56963a 100644 --- a/test/onnx/gen_onnx.py +++ b/test/onnx/gen_onnx.py @@ -11158,31 +11158,6 @@ def nms_test(): return ([node], [b, s, mo, iou, st], [out]) -@onnx_test() -def nms_use_dyn_output_false_test(): - b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, 6, 4]) - s = helper.make_tensor_value_info('scores', TensorProto.FLOAT, [1, 1, 6]) - mo = helper.make_tensor_value_info('max_output_boxes_per_class', - TensorProto.INT64, [1]) - iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT, - [1]) - st = 
helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT, - [1]) - out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64, - [None, 3]) - - node = onnx.helper.make_node('NonMaxSuppression', - inputs=[ - 'boxes', 'scores', - 'max_output_boxes_per_class', - 'iou_threshold', 'score_threshold' - ], - outputs=['selected_indices'], - use_dyn_output=0) - - return ([node], [b, s, mo, iou, st], [out]) - - @onnx_test() def nms_dynamic_batch_test(): b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [None, 6, 4]) @@ -11204,8 +11179,7 @@ def nms_dynamic_batch_test(): 'iou_threshold', 'score_threshold' ], outputs=['selected_indices'], - center_point_box=1, - use_dyn_output=1) + center_point_box=1) return ([node], [b, s, mo, iou, st], [out]) diff --git a/test/onnx/nms_dynamic_batch_test.onnx b/test/onnx/nms_dynamic_batch_test.onnx index 65b3ff5bd74f800105c2cdcb6e2e1d9303ca8d74..174d699680c39e364c184cac3a735d88399c819b 100644 GIT binary patch delta 24 gcmbQu+``Pn!8!4O1gq6!Ca%tj@?8@T++q|409PXi(f|Me delta 63 zcmZo+p3N-J!7e10ms=d4Qkj>So0%M+lvt9S5nqy8Tw?W$iEF_``7R+XQ7*pH;#825 R`25n6g3^+SOYSrB0sxsl7CHa` diff --git a/test/onnx/nms_use_dyn_output_false_test.onnx b/test/onnx/nms_use_dyn_output_false_test.onnx deleted file mode 100644 index 6f4d989d6a676004f890b9bd964345499cefe784..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 404 zcmah^ONzok5VfNwP2ndt2r{D}uEZm_bnk53Wz)o@#X#&1RyV{GcnuHeAuK~=P(c?( zLA_7C#{)6Z2b2z#Oix-GXA5WLF3#y-724(1gENEn)|3jt$HW|I{~Y4-!L1NksH05s z7!kUeankOIwl{eS{YvtG5Fx5uC03;}(`l9{oDKerUdQFe6$PmhX%PfS6add_parameter("iou_threshold", siou); migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); - auto ret = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), b, s, mo, 
iou, st); + auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); migraphx::onnx_options options; options.default_dyn_dim_value = {1, 10}; - options.use_dyn_output = true; auto prog = read_onnx("nms_dynamic_batch_test.onnx", options); EXPECT(p == prog); diff --git a/test/onnx/parse/nms_dynamic_boxes_test.cpp b/test/onnx/parse/nms_dynamic_boxes_test.cpp index 42706ccefdc..d11552ca3d7 100644 --- a/test/onnx/parse/nms_dynamic_boxes_test.cpp +++ b/test/onnx/parse/nms_dynamic_boxes_test.cpp @@ -38,13 +38,13 @@ TEST_CASE(nms_dynamic_boxes_test) auto iou = mm->add_parameter("iou_threshold", siou); migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); - auto ret = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), b, s, mo, iou, st); + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st); + auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); migraphx::onnx_options options; options.default_dyn_dim_value = {6, 20}; - options.use_dyn_output = true; auto prog = read_onnx("nms_dynamic_boxes_test.onnx", options); EXPECT(p == prog); diff --git a/test/onnx/parse/nms_dynamic_classes_test.cpp b/test/onnx/parse/nms_dynamic_classes_test.cpp index 9e230a067f9..67a21634568 100644 --- a/test/onnx/parse/nms_dynamic_classes_test.cpp +++ b/test/onnx/parse/nms_dynamic_classes_test.cpp @@ -38,13 +38,13 @@ TEST_CASE(nms_dynamic_classes_test) auto iou = mm->add_parameter("iou_threshold", siou); migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); - auto ret = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), b, s, mo, iou, st); + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st); + auto ret = 
mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); migraphx::onnx_options options; options.default_dyn_dim_value = {1, 10}; - options.use_dyn_output = true; auto prog = read_onnx("nms_dynamic_classes_test.onnx", options); EXPECT(p == prog); diff --git a/test/onnx/parse/nms_test.cpp b/test/onnx/parse/nms_test.cpp index 6836117bbd8..f8826a8a96e 100644 --- a/test/onnx/parse/nms_test.cpp +++ b/test/onnx/parse/nms_test.cpp @@ -43,8 +43,9 @@ TEST_CASE(nms_test) migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); - auto ret = mm->add_instruction( + auto nms = mm->add_instruction( migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), b, s, mo, iou, st); + auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); auto prog = read_onnx("nms_test.onnx"); diff --git a/test/onnx/parse/nms_use_dyn_output_false_test.cpp b/test/onnx/parse/nms_use_dyn_output_false_test.cpp deleted file mode 100644 index 8e95686550f..00000000000 --- a/test/onnx/parse/nms_use_dyn_output_false_test.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * The MIT License (MIT) - * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include - -TEST_CASE(nms_overwrite_use_dyn_output_test) -{ - migraphx::program p; - auto* mm = p.get_main_module(); - migraphx::shape sb{migraphx::shape::float_type, {1, 6, 4}}; - auto b = mm->add_parameter("boxes", sb); - - migraphx::shape ss{migraphx::shape::float_type, {1, 1, 6}}; - auto s = mm->add_parameter("scores", ss); - - migraphx::shape smo{migraphx::shape::int64_type, {1}}; - auto mo = mm->add_parameter("max_output_boxes_per_class", smo); - - migraphx::shape siou{migraphx::shape::float_type, {1}}; - auto iou = mm->add_parameter("iou_threshold", siou); - - migraphx::shape sst{migraphx::shape::float_type, {1}}; - auto st = mm->add_parameter("score_threshold", sst); - - auto ret = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), b, s, mo, iou, st); - mm->add_return({ret}); - - migraphx::onnx_options options; - options.use_dyn_output = true; - - auto prog = read_onnx("nms_use_dyn_output_false_test.onnx", options); - EXPECT(p == prog); -} diff --git a/test/op_shape_test.cpp b/test/op_shape_test.cpp index d6655e49967..89f1a743f06 100644 --- a/test/op_shape_test.cpp +++ b/test/op_shape_test.cpp @@ -2853,64 +2853,46 @@ TEST_CASE(multinomial_dyn) TEST_CASE(nms_shape) { - // use_dyn_output == false - migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; - migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + // The nonmaxsuppression op always returns a tuple shape: + // {indices [max_num_boxes, 3] int64, 
num_selected [1] int64} + // where max_num_boxes = max_batches * max_classes * max_spatial_dim (from max_lens). migraphx::shape max_out_s{migraphx::shape::int64_type, {1}}; migraphx::shape iou_thres_s{migraphx::shape::float_type, {1}}; migraphx::shape score_thres_s{migraphx::shape::float_type, {1}}; - migraphx::shape output_s{migraphx::shape::int64_type, {6, 3}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", false}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); + migraphx::shape num_selected_s{migraphx::shape::int64_type, {1}}; + + auto nms_tuple = [&](std::size_t max_num_boxes) { + return migraphx::shape( + {migraphx::shape{migraphx::shape::int64_type, {max_num_boxes, 3}}, num_selected_s}); + }; - // use_dyn_output == true - output_s = {migraphx::shape::int64_type, {{0, 6}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + // fully static inputs + migraphx::shape boxes_s{migraphx::shape::float_type, {1, 6, 4}}; + migraphx::shape scores_s{migraphx::shape::float_type, {1, 1, 6}}; + expect_shape(nms_tuple(6), + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_s, scores_s, max_out_s, iou_thres_s, score_thres_s); - // dynamic batches + // dynamic batches: max_num_boxes = 3 * 1 * 6 = 18 boxes_s = {migraphx::shape::float_type, {{1, 3}, {6, 6}, {4, 4}}}; scores_s = {migraphx::shape::float_type, {{1, 3}, {1, 1}, {6, 6}}}; - output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + expect_shape(nms_tuple(18), + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_s, scores_s, max_out_s, iou_thres_s, score_thres_s); - // dynamic num boxes + // dynamic num boxes: max_num_boxes = 1 * 1 * 20 = 20 boxes_s = 
{migraphx::shape::float_type, {{1, 1}, {6, 20}, {4, 4}}}; scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 1}, {6, 20}}}; - output_s = {migraphx::shape::int64_type, {{0, 20}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); - - // use_dyn_output false with dynamic input shape: auto-enables dynamic output - output_s = {migraphx::shape::int64_type, {{0, 20}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", false}}), + expect_shape(nms_tuple(20), + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_s, scores_s, max_out_s, @@ -2920,90 +2902,20 @@ TEST_CASE(nms_shape) // dynamic classes: max_num_boxes = 1 * 3 * 6 = 18 boxes_s = {migraphx::shape::float_type, {{1, 1}, {6, 6}, {4, 4}}}; scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 3}, {6, 6}}}; - output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); - - // fixed mismatch batches: use_dyn_output=true takes dynamic path, deferred to runtime - // max_num_boxes = 2 * 1 * 6 = 12 - boxes_s = {migraphx::shape::float_type, {2, 6, 4}}; - scores_s = {migraphx::shape::float_type, {1, 1, 6}}; - output_s = {migraphx::shape::int64_type, {{0, 12}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + expect_shape(nms_tuple(18), + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_s, scores_s, max_out_s, iou_thres_s, score_thres_s); - // fixed mismatch num boxes: use_dyn_output=true takes dynamic path, deferred to runtime - // max_num_boxes = 1 * 1 * 6 = 6 - boxes_s = 
{migraphx::shape::float_type, {1, 6, 4}}; - scores_s = {migraphx::shape::float_type, {1, 1, 4}}; - output_s = {migraphx::shape::int64_type, {{0, 6}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); - - // dynamic mismatch batches: deferred to runtime validation - // max_num_boxes = boxes_max[0] * scores_max[1] * boxes_max[1] = 4 * 1 * 6 = 24 - boxes_s = {migraphx::shape::float_type, {{1, 4}, {6, 6}, {4, 4}}}; - scores_s = {migraphx::shape::float_type, {{2, 8}, {1, 1}, {6, 6}}}; - output_s = {migraphx::shape::int64_type, {{0, 24}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); - - // dynamic mismatch num boxes: deferred to runtime validation + // dynamic mismatch num boxes: deferred to runtime validation. 
+ // spatial dim is taken from boxes.max_lens()[1] = 8, so max_num_boxes = 1 * 1 * 8 = 8 boxes_s = {migraphx::shape::float_type, {{1, 1}, {6, 8}, {4, 4}}}; scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 1}, {3, 9}}}; - output_s = {migraphx::shape::int64_type, {{0, 8}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); - - // dynamic number of classes, fixed boxes_s, mismatch batches: deferred to runtime - boxes_s = {migraphx::shape::float_type, {1, 6, 4}}; - scores_s = {migraphx::shape::float_type, {{1, 3}, {1, 3}, {6, 6}}}; - output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_s, - scores_s, - max_out_s, - iou_thres_s, - score_thres_s); - // dynamic number of classes, fixed boxes_s, mismatch num boxes: deferred to runtime - boxes_s = {migraphx::shape::float_type, {1, 6, 4}}; - scores_s = {migraphx::shape::float_type, {{1, 1}, {1, 3}, {4, 8}}}; - output_s = {migraphx::shape::int64_type, {{0, 18}, {3, 3}}}; - expect_shape(output_s, - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + expect_shape(nms_tuple(8), + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_s, scores_s, max_out_s, diff --git a/test/ref/nonmaxsuppression.cpp b/test/ref/nonmaxsuppression.cpp index c65dc4916ea..8f16ec6dd75 100644 --- a/test/ref/nonmaxsuppression.cpp +++ b/test/ref/nonmaxsuppression.cpp @@ -30,6 +30,15 @@ #include +static migraphx::instruction_ref add_nms_dynamic_slice(migraphx::module* mm, + migraphx::instruction_ref nms) +{ + auto idx = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); + auto cnt = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 1}}), nms); + 
return mm->add_instruction( + migraphx::make_op("slice", {{"axes", {0}}, {"starts", {0}}}), idx, cnt); +} + TEST_CASE(nms_dyn_out_test) { migraphx::program p; @@ -47,14 +56,13 @@ TEST_CASE(nms_dyn_out_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -83,12 +91,13 @@ TEST_CASE(nms_identical_all_dyn_out_test) auto iou_threshold = mm->add_literal(0.1f); auto score_threshold = mm->add_literal(0.0f); - auto r = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", true}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -114,14 +123,14 @@ TEST_CASE(nms_dyn_batch_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_p, scores_p, max_out_l, iou_threshold, score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -160,14 +169,14 @@ TEST_CASE(nms_dyn_boxes_test) auto iou_threshold = 
mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_p, scores_p, max_out_l, iou_threshold, score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -203,14 +212,14 @@ TEST_CASE(nms_dyn_classes_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_p, scores_p, max_out_l, iou_threshold, score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -251,21 +260,20 @@ TEST_CASE(nms_not_center_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - // set use_dyn_output back to false in operator map - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"use_dyn_output", false}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); auto output = p.eval({}).back(); std::vector result; output.visit([&](auto out) { result.assign(out.begin(), out.end()); }); - std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(result, gold)); } @@ -286,20 +294,20 @@ 
TEST_CASE(nms_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); auto output = p.eval({}).back(); std::vector result; output.visit([&](auto out) { result.assign(out.begin(), out.end()); }); - std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(result, gold)); } @@ -324,20 +332,20 @@ TEST_CASE(nms_transpose1_test) auto transpose_boxes = mm->add_instruction( migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l); - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - transpose_boxes, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + transpose_boxes, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); auto output = p.eval({}).back(); std::vector result; output.visit([&](auto out) { result.assign(out.begin(), out.end()); }); - std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(result, gold)); } @@ -362,20 +370,20 @@ TEST_CASE(nms_transpose2_test) auto transpose_boxes = mm->add_instruction( migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), 
t_boxes_l); - auto r = - mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - transpose_boxes, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + transpose_boxes, + scores_l, + max_out_l, + iou_threshold, + score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); auto output = p.eval({}).back(); std::vector result; output.visit([&](auto out) { result.assign(out.begin(), out.end()); }); - std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; EXPECT(migraphx::verify::verify_rms_range(result, gold)); } @@ -396,14 +404,14 @@ TEST_CASE(nms_dyn_different_spatial_ranges_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto r = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", - {{"center_point_box", true}, {"use_dyn_output", true}}), + auto nms = mm->add_instruction( + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_p, scores_p, max_out_l, iou_threshold, score_threshold); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); From 22d8beb390196d57916452cad9c1d5194d5cff9a Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 15:46:37 -0500 Subject: [PATCH 25/32] Add ref fallback for dynamic input NMS and cleanup kernel types --- src/include/migraphx/op/nonmaxsuppression.hpp | 3 +- .../migraphx/kernels/nonmaxsuppression.hpp | 44 ++--- src/targets/gpu/lowering.cpp | 186 ++++++++++++------ src/targets/gpu/nms_ops.cpp | 3 + test/gpu/nonmaxsuppression.cpp | 49 +++++ 5 files changed, 204 insertions(+), 81 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 
96ad442cd8a..87bd541fba1 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -189,7 +189,8 @@ struct nonmaxsuppression return intersection_over_union > iou_threshold; } - // filter boxes below score_threshold + // Filter boxes below score_threshold. + // Don't filter for score if score_threshold == 0.f template std::vector> filter_boxes_by_score(T scores_start, std::size_t num_boxes, double score_threshold) const diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 7ae9638e173..177f373712a 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -64,23 +64,23 @@ __device__ inline array nms_normalize_box(const Box box) { if constexpr(CenterPointBox) { - const float xc = box[0]; - const float yc = box[1]; - const float hw = box[2] * 0.5f; - const float hh = box[3] * 0.5f; + const auto xc = box[0]; + const auto yc = box[1]; + const auto hw = box[2] * 0.5f; + const auto hh = box[3] * 0.5f; return {xc - hw, yc - hh, xc + hw, yc + hh}; } else { // ONNX layout: [y1, x1, y2, x2]; corners may be in either order. 
- const float y1 = box[0]; - const float x1 = box[1]; - const float y2 = box[2]; - const float x2 = box[3]; - const float xmin = min(x1, x2); - const float xmax = max(x1, x2); - const float ymin = min(y1, y2); - const float ymax = max(y1, y2); + const auto y1 = box[0]; + const auto x1 = box[1]; + const auto y2 = box[2]; + const auto x2 = box[3]; + const auto xmin = min(x1, x2); + const auto xmax = max(x1, x2); + const auto ymin = min(y1, y2); + const auto ymax = max(y1, y2); return {xmin, ymin, xmax, ymax}; } } @@ -88,16 +88,16 @@ __device__ inline array nms_normalize_box(const Box box) template __device__ inline bool nms_iou_over_threshold(const Box a, const Box b, const Threshold threshold) { - const float left = max(a[0], b[0]); - const float right = min(a[2], b[2]); - const float top = max(a[1], b[1]); - const float bottom = min(a[3], b[3]); - const float w = max(right - left, 0.f); - const float h = max(bottom - top, 0.f); - const float inter = w * h; - const float area_a = max(a[2] - a[0], 0.f) * max(a[3] - a[1], 0.f); - const float area_b = max(b[2] - b[0], 0.f) * max(b[3] - b[1], 0.f); - const float un = area_a + area_b - inter; + const auto left = max(a[0], b[0]); + const auto right = min(a[2], b[2]); + const auto top = max(a[1], b[1]); + const auto bottom = min(a[3], b[3]); + const auto w = max(right - left, 0.f); + const auto h = max(bottom - top, 0.f); + const auto inter = w * h; + const auto area_a = max(a[2] - a[0], 0.f) * max(a[3] - a[1], 0.f); + const auto area_b = max(b[2] - b[0], 0.f) * max(b[3] - b[1], 0.f); + const auto un = area_a + area_b - inter; if(area_a <= 0.f or area_b <= 0.f or un <= 0.f) return false; return (inter / un) > threshold; diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 092fe42892f..192e3f8c45d 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -59,7 +59,6 @@ namespace gpu { MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_SET_GEMM_PROVIDER) 
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MIOPEN_POOLING) -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS) struct miopen_apply { @@ -455,69 +454,140 @@ struct miopen_apply // compile pass can pick them up later. We can't rely on the main lowering // loop to wrap them: it walks forward, and the new instructions land // before `ins` so they would never be revisited. + // + // The kernels are JIT'd against compile-time sizes baked from the input + // shapes, so when either of `boxes` / `scores` is dynamic we fall back to + // executing the ref op on the host. void add_nms_op() { apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { - auto inputs = ins->inputs(); - const auto& boxes_s = inputs[0]->get_shape(); - const auto& scores_s = inputs[1]->get_shape(); - const auto num_batches = boxes_s.lens()[0]; - const auto num_boxes = boxes_s.lens()[1]; - const auto num_classes = scores_s.lens()[1]; - const auto iou_packed = num_boxes * (num_boxes - 1) / 2; - - // Fill in missing optional scalar inputs with default literals. - const shape default_max_s{shape::int64_type, {1}}; - const shape default_iou_s{shape::float_type, {1}}; - const shape default_thr_s{shape::float_type, {1}}; - if(inputs.size() < 3) - inputs.push_back( - mod->insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); - if(inputs.size() < 4) - inputs.push_back(mod->insert_literal(ins, literal{default_iou_s, {0.0f}})); - if(inputs.size() < 5) - inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}})); - - bool center_point_box = - ins->get_operator().to_value().at("center_point_box").to(); - - // Mask is scratch only; allocate up-front so the standard - // replace_allocate pass can later turn it into hip::allocate. 
- shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; - auto mask_alloc = insert_allocation(ins, mask_shape); - - auto sorted = mod->insert_instruction( - ins, - make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), - inputs[0], - inputs[1]); - sorted = insert_precompile_op(sorted); - - auto filter = mod->insert_instruction(ins, - make_op("gpu::nms_filter", - {{"num_batches", num_batches}, - {"num_classes", num_classes}, - {"num_boxes", num_boxes}}), - sorted, - inputs[2], - inputs[3], - inputs[4], - mask_alloc); - filter = insert_precompile_op(filter); - - auto raw_output = - mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); - auto bc_counts = - mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); - - auto compact = - mod->insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); - compact = insert_precompile_op(compact); - - return mod->replace_instruction(ins, compact); + const auto& boxes_s = ins->inputs()[0]->get_shape(); + const auto& scores_s = ins->inputs()[1]->get_shape(); + if(boxes_s.dynamic() or scores_s.dynamic()) + return lower_nms_to_ref(ins); + return lower_nms_to_gpu_pipeline(ins); }); } + // Static GPU pipeline: gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact. + instruction_ref lower_nms_to_gpu_pipeline(instruction_ref ins) const + { + auto inputs = ins->inputs(); + const auto& boxes_s = inputs[0]->get_shape(); + const auto& scores_s = inputs[1]->get_shape(); + const auto num_batches = boxes_s.lens()[0]; + const auto num_boxes = boxes_s.lens()[1]; + const auto num_classes = scores_s.lens()[1]; + const auto iou_packed = num_boxes * (num_boxes - 1) / 2; + + // Fill in missing optional scalar inputs with default literals. 
+ const shape default_max_s{shape::int64_type, {1}}; + const shape default_iou_s{shape::float_type, {1}}; + const shape default_thr_s{shape::float_type, {1}}; + if(inputs.size() < 3) + inputs.push_back(mod->insert_literal(ins, literal{default_max_s, {std::int64_t{0}}})); + if(inputs.size() < 4) + inputs.push_back(mod->insert_literal(ins, literal{default_iou_s, {0.0f}})); + if(inputs.size() < 5) + inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}})); + + bool center_point_box = + ins->get_operator().to_value().at("center_point_box").to(); + + // Mask is scratch only; allocate up-front so the standard + // replace_allocate pass can later turn it into hip::allocate. + shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; + auto mask_alloc = insert_allocation(ins, mask_shape); + + auto sorted = mod->insert_instruction( + ins, + make_op("gpu::nms_sort", {{"center_point_box", center_point_box}}), + inputs[0], + inputs[1]); + sorted = insert_precompile_op(sorted); + + auto filter = mod->insert_instruction(ins, + make_op("gpu::nms_filter", + {{"num_batches", num_batches}, + {"num_classes", num_classes}, + {"num_boxes", num_boxes}}), + sorted, + inputs[2], + inputs[3], + inputs[4], + mask_alloc); + filter = insert_precompile_op(filter); + + auto raw_output = + mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 0}}), filter); + auto bc_counts = + mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", 1}}), filter); + + auto compact = + mod->insert_instruction(ins, make_op("gpu::nms_compact"), bc_counts, raw_output); + compact = insert_precompile_op(compact); + + return mod->replace_instruction(ins, compact); + } + + // Dynamic-shape fallback: run the ref `nonmaxsuppression` op on the host + // and copy each tuple element back to its own GPU allocation. 
Downstream + // `get_tuple_elem` consumers of `ins` are rewritten in place to point at + // the per-element GPU copies; `ins` itself is left for DCE to remove. + // + // The ref op produces a tuple {indices, num_selected}, and `hip::copy_to_gpu` + // is not tuple-aware (calls `argument::data()` which asserts non-tuple), so + // we have to split the tuple on the host side before copying back. + instruction_ref lower_nms_to_ref(instruction_ref ins) const + { + // Copy each input from GPU to host, then sync before running the ref op. + auto inputs = ins->inputs(); + std::vector cpu_inputs; + cpu_inputs.reserve(inputs.size()); + std::transform( + inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = + mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + + // Ref op produces a tuple {indices [max_num_boxes, 3], num_selected [1]}. + auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); + + // For each sub-shape, extract on the host side and copy back to its + // own GPU allocation. + const auto& sub_shapes = ins->get_shape().sub_shapes(); + std::vector gpu_subs; + gpu_subs.reserve(sub_shapes.size()); + for(std::size_t i = 0; i < sub_shapes.size(); ++i) + { + auto cpu_sub = mod->insert_instruction( + ins, make_op("get_tuple_elem", {{"index", i}}), cpu_out); + auto gpu_alloc = insert_allocation(ins, sub_shapes[i]); + gpu_subs.push_back(mod->insert_instruction( + ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc)); + } + + // Snapshot outputs since we mutate the graph below. 
+ auto consumers = ins->outputs(); + for(auto consumer : consumers) + { + if(consumer->name() != "get_tuple_elem") + MIGRAPHX_THROW("gpu::add_nms_op: dynamic NMS fallback expects only " + "get_tuple_elem consumers of nonmaxsuppression; got: " + + consumer->name()); + auto idx = + consumer->get_operator().to_value().at("index").to(); + assert(idx < gpu_subs.size()); + mod->replace_instruction(consumer, gpu_subs[idx]); + } + + // `ins` is now dead; leave it for dead_code_elimination. Return it so + // the apply-loop shape check (which compares against the original + // tuple shape) succeeds. + return ins; + } + void add_lrn_op() { apply_map.emplace("lrn", [=](instruction_ref ins) { diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp index a1fb8fdfe48..f9ac82c8ebf 100644 --- a/src/targets/gpu/nms_ops.cpp +++ b/src/targets/gpu/nms_ops.cpp @@ -33,6 +33,7 @@ inline namespace MIGRAPHX_INLINE_NS { namespace gpu { // Sort boxes per (batch, class) into nms_data{} tensor. +// inputs = {boxes, scores} struct nms_sort { bool center_point_box = false; @@ -69,6 +70,7 @@ MIGRAPHX_REGISTER_OP(nms_sort); // Produces a tuple of (raw_output, bc_counts). // num_batches/num_classes/num_boxes are kept as op attributes because the filter inputs // is a scratch buffer from which these can't be recovered. +// inputs = {sorted_boxes, sorted_scores, sorted_box_indices, output_indices, output_bc_counts} struct nms_filter { std::size_t num_batches = 0; @@ -99,6 +101,7 @@ MIGRAPHX_REGISTER_OP(nms_filter); // Needs a make_tuple type of operator that reuses the indicies input. // Prefix-scan the per-block counts and compact the selections into // the final selected_indices. Output as selected_indices and num_selected tuple. 
+// inputs = {output_bc_counts, output_indices} struct nms_compact { std::string name() const { return "gpu::nms_compact"; } diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp index f123263b596..4d9bdb89602 100644 --- a/test/gpu/nonmaxsuppression.cpp +++ b/test/gpu/nonmaxsuppression.cpp @@ -187,6 +187,55 @@ TEST_CASE(nms_not_center_test) EXPECT(num_selected == 3); } +// Exercises the CPU fallback in src/targets/gpu/lowering.cpp::lower_nms_to_ref +// by declaring dynamic-shape parameters for boxes/scores. Uses different +// dynamic-dim ranges between boxes and scores so split_single_dyn_dim bails +// out (its has_one_unique_dyn_dim check requires identical ranges) and the +// dynamic shape survives until lowering. Same data and gold as +// nms_not_center_test. +TEST_CASE(nms_dynamic_fallback_test) +{ + using dd = migraphx::shape::dynamic_dimension; + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape boxes_dyn_s{migraphx::shape::float_type, + {dd{1, 1}, dd{4, 10}, dd{4, 4}}}; + migraphx::shape scores_dyn_s{migraphx::shape::float_type, + {dd{1, 1}, dd{1, 1}, dd{4, 8}}}; + + auto boxes_p = mm->add_parameter("boxes", boxes_dyn_s); + auto scores_p = mm->add_parameter("scores", scores_dyn_s); + auto max_out_l = mm->add_literal(int64_t{4}); + auto iou_threshold = mm->add_literal(0.5f); + auto score_threshold = mm->add_literal(0.0f); + + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); + add_nms_return(mm, nms); + + std::vector boxes_vec = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + std::vector scores_vec = {0.9f, 0.75f, 0.6f, 0.95f, 0.5f, 0.3f}; + + migraphx::shape boxes_runtime_s{migraphx::shape::float_type, {1, 6, 4}}; + migraphx::shape scores_runtime_s{migraphx::shape::float_type, {1, 1, 6}}; + + migraphx::parameter_map host_params; + 
host_params["boxes"] = migraphx::argument(boxes_runtime_s, boxes_vec.data()); + host_params["scores"] = migraphx::argument(scores_runtime_s, scores_vec.data()); + + auto [indices, num_selected] = run_gpu_nms(std::move(p), host_params); + indices.resize(static_cast(num_selected) * 3); + std::vector gold = {0, 0, 3, 0, 0, 0, 0, 0, 5}; + EXPECT(migraphx::verify::verify_rms_range(indices, gold)); + EXPECT(num_selected == 3); +} + TEST_CASE(nms_transpose1_test) { migraphx::program p; From 8fc4844cd0c823e08aa35c3144f3eb07667431f3 Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 16:23:08 -0500 Subject: [PATCH 26/32] Get rid of nms_data in kernel to use global memory only for now --- .../migraphx/kernels/nonmaxsuppression.hpp | 164 ++++++++---------- .../kernels/include/migraphx/kernels/sort.hpp | 48 ++++- src/targets/gpu/lowering.cpp | 41 ++--- src/targets/gpu/nms_ops.cpp | 3 +- 4 files changed, 132 insertions(+), 124 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 177f373712a..ee5b1b090e9 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -38,25 +38,6 @@ namespace migraphx { -template -struct nms_data -{ - // holds a copy of data - Score score; - array box; - Index box_index; -}; - -// Comparator for sorting nms_data{} (or anything else with a `.score` field). -struct nms_score_greater -{ - template - constexpr bool operator()(const T& a, const T& b) const - { - return a.score > b.score; - } -}; - // Decode a single box into (xmin, ymin, xmax, ymax) corners. // Normalize such that [x1, y1] is the bottom left corner. template @@ -109,15 +90,14 @@ __device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N return (i * N - (i * (i + 1)) / 2) + j - (i + 1); } -// One block per (batch_idx, class_idx). 
-// Load data into per-block buffer of nms_data. -// Pads values after N with sentinel values. -// Sorts the nms_data in descending order by score. -// boxes_tv: dims([NumBatches, NumBoxes, 4]) -// scores_tv: dims([NumBatches, NumClasses, NumBoxes]) -// sorted_scores: output, dims([B, C, AlignedNumBoxes]) -// sorted_boxes: output, dims([B, C, AlignedNumBoxes, 4]) -// sorted_indices: output, dims([B, C, AlignedNumBoxes]) +// One block per (batch_idx, class_idx). Initializes the per-block slice of +// sorted_* in place (padding past NumBoxes with score-sentinels) and bitonic +// sorts the three global arrays in lockstep by descending score. +// boxes_tv: dims([NumBatches, NumBoxes, 4]) +// scores_tv: dims([NumBatches, NumClasses, NumBoxes]) +// sorted_scores: out, dims([B, C, AlignedNumBoxes]) +// sorted_boxes: out, dims([B, C, AlignedNumBoxes, 4]) +// sorted_indices: out, dims([B, C, AlignedNumBoxes]) template {batch_idx, class_idx, 0}, slice_axes<2>()); + // TODO: make version that uses block shared memory if the data will fit + auto my_sorted_scores = + slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); + auto my_sorted_boxes = + slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); + auto my_sorted_indices = + slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); + using scores_type = typename SortedScores::type; using boxes_type = typename SortedBoxes::type; using indices_type = typename SortedIndices::type; - // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS. - // TODO: can add a static_assert on needed LDS size - __shared__ - uninitialized_buffer, AlignedNumBoxes> - block_nms_data; + + // Initialize sorted_* in place; pad past NumBoxes with sentinels that + // sink to the end under descending sort. sorted_boxes is 3D ([1, N, 4]) + // since slice_tensor preserves rank. 
idx.local_stride(AlignedNumBoxes, [&](auto i) { if(i < NumBoxes) { - block_nms_data[i].score = my_scores[i]; - block_nms_data[i].box = nms_normalize_box( + const auto box = nms_normalize_box( slice_tensor(boxes_tv, array{batch_idx, i, 0}, slice_axes<2>())); - block_nms_data[i].box_index = static_cast(i); + my_sorted_scores[i] = my_scores[i]; + for(index_int k = 0; k < 4; ++k) + my_sorted_boxes[array{0, i, k}] = box[k]; + my_sorted_indices[i] = static_cast(i); } else { - block_nms_data[i].score = numeric_lowest(); - block_nms_data[i].box = array{0.f, 0.f, 0.f, 0.f}; - block_nms_data[i].box_index = -1; + my_sorted_scores[i] = numeric_lowest(); + for(index_int k = 0; k < 4; ++k) + my_sorted_boxes[array{0, i, k}] = boxes_type{0}; + my_sorted_indices[i] = static_cast(-1); } }); __syncthreads(); - bitonic_sort{nms_score_greater{}}.template block_sort(idx, block_nms_data); - - // Copy sorted result back to global memory. - auto block_out_scores = - slice_tensor(sorted_scores, array{block_id, 0}, slice_axes<1>()); - auto block_out_boxes = - slice_tensor(sorted_boxes, array{block_id, 0, 0}, slice_axes<1, 2>()); - auto block_out_indices = - slice_tensor(sorted_indices, array{block_id, 0}, slice_axes<1>()); - idx.local_stride(AlignedNumBoxes, [&](auto i) { - block_out_scores[i] = block_nms_data[i].score; - auto out_box_iter = block_out_boxes.begin_at(array{0, i, 0}); - copy(block_nms_data[i].box.begin(), block_nms_data[i].box.end(), out_box_iter); - block_out_indices[i] = block_nms_data[i].box_index; - }); + // Sort scores descending, dragging boxes and indices along. Uses the + // indexed variant so we can swap all 4 box lanes per index pair. 
+ bitonic_sort{greater{}}.template block_sort_indexed( + idx, + [&](auto i, auto j) { return my_sorted_scores[j] > my_sorted_scores[i]; }, + [&](auto i, auto j) { + swap(my_sorted_scores[i], my_sorted_scores[j]); + swap(my_sorted_indices[i], my_sorted_indices[j]); + for(index_int k = 0; k < 4; ++k) + swap(my_sorted_boxes[array{0, i, k}], + my_sorted_boxes[array{0, j, k}]); + }); } -// Build the packed upper-triangular IoU mask for the NumBoxes nms_data boxes. -// Work is striped such that each thread does a multiple of 2 rows so each does roughly the same -// amount of work regardless of where it falls in the triangle. -// `nms_data`: nms_data nms_data{} tensor -// `mask`: bool mask tensor -template -__device__ void -nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const float iou_threshold) +// Build the packed upper-triangular IoU mask for the first NumBoxes sorted +// boxes. Threads are paired across the triangle so each does roughly the same +// amount of work. +// `sorted_boxes`: per-block 3D view, dims([1, >=NumBoxes, 4]) +// `mask`: bool mask tensor +template +__device__ void nms_make_iou_mask(const index idx, + const SortedBoxes sorted_boxes, + Mask mask, + const float iou_threshold) { static_assert(NumBoxes > 0); constexpr index_int half = NumBoxes / 2; + using box_elem_type = typename SortedBoxes::type; + + auto load_box = [&](index_int i) { + return array{sorted_boxes[array{0, i, 0}], + sorted_boxes[array{0, i, 1}], + sorted_boxes[array{0, i, 2}], + sorted_boxes[array{0, i, 3}]}; + }; auto fill_row = [&](index_int i) { + const auto box_i = load_box(i); for(index_int j = i + 1; j < NumBoxes; ++j) { mask[nms_packed_idx(i, j, NumBoxes)] = - nms_iou_over_threshold(nms_data[i].box, nms_data[j].box, iou_threshold); + nms_iou_over_threshold(box_i, load_box(j), iou_threshold); } }; @@ -223,15 +221,18 @@ nms_make_iou_mask(const index idx, const NMSData nms_data, Mask mask, const floa } } -// Greedy filter that writes selections into a 
per-batch per-class region of output. +// Greedy filter that writes selections into a per-batch per-class region of +// output, reading scores and original-box indices from the sorted_* views. template __device__ void nms_filter_per_block(const index idx, - const NMSData nms_data, + const SortedScores sorted_scores, + const SortedIndices sorted_indices, const Mask mask, const int max_output, const float score_thr, @@ -247,7 +248,7 @@ __device__ void nms_filter_per_block(const index idx, // Match the ref op: only filter by score when score_threshold > 0. const bool do_filter = score_thr > 0.f; idx.local_stride(NumBoxes, - [&](auto i) { removed[i] = (do_filter and nms_data[i].score < score_thr); }); + [&](auto i) { removed[i] = (do_filter and sorted_scores[i] < score_thr); }); __syncthreads(); index_int output_idx = 0; @@ -262,7 +263,7 @@ __device__ void nms_filter_per_block(const index idx, { if(idx.local == 0) { - array tmp = {batch_idx, class_idx, nms_data[i].box_index}; + array tmp = {batch_idx, class_idx, sorted_indices[i]}; auto output_iter = block_output.begin_at(array{0, output_idx, 0}); copy(tmp.begin(), tmp.end(), output_iter); } @@ -279,10 +280,11 @@ __device__ void nms_filter_per_block(const index idx, bc_counts[block_id] = static_cast(output_idx); } -// Per-block filter driver: one block per (batch_idx, class_idx).`. -// Expecting box-coordinate convention has already been normalized into corner form. // TODO: Merge the nonmaxsuppression_sort and nonmaxsuppression_filter kernels by relaxing -// the AlignedNumBoxes resitriction for the sort. +// the AlignedNumBoxes restriction for the sort. +// Per-block filter driver: one block per (batch_idx, class_idx). Slices the +// global sorted_* arrays and passes the views to the IoU-mask and greedy +// filter helpers. Box coordinates are assumed to already be in corner form. 
template {block_idx, 0}, slice_axes<1>()); - using scores_type = typename SortedScores::type; - using boxes_type = typename SortedBoxes::type; - using indices_type = typename SortedIndices::type; - // Use shared memory for sorting per-block nms_data. Assuming it fits in LDS. - // TODO: can add a static_assert on needed LDS size - __shared__ uninitialized_buffer, NumBoxes> - block_nms_data; - - idx.local_stride(NumBoxes, [&](auto i) { - block_nms_data[i].score = my_sorted_scores[i]; - auto boxes_iter = my_sorted_boxes.begin_at(array{0, i, 0}); - copy(boxes_iter, boxes_iter + 4, block_nms_data[i].box.begin()); - block_nms_data[i].box_index = my_sorted_indices[i]; - }); auto my_mask = slice_tensor(mask, array{block_idx, 0}, slice_axes<1>()); auto my_output = slice_tensor(output, array{block_idx, 0, 0}, slice_axes<1, 2>()); @@ -342,12 +330,12 @@ __device__ void nonmaxsuppression_filter(const SortedScores sorted_scores, const float iou_thr_val = iou_thr_p[0]; const float score_thr_val = score_thr_p[0]; - __syncthreads(); - nms_make_iou_mask(idx, block_nms_data, my_mask, iou_thr_val); + nms_make_iou_mask(idx, my_sorted_boxes, my_mask, iou_thr_val); __syncthreads(); nms_filter_per_block(idx, - block_nms_data, + my_sorted_scores, + my_sorted_indices, my_mask, max_output_boxes_per_class, score_thr_val, diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index b49d78ca572..491e9348e1e 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -140,12 +140,8 @@ struct bitonic_sort }); } - // Block-level bitonic sort over a power-of-two buffer in shared or global - // memory. All threads in the block cooperate; buf must point to N elements - // visible to every thread. The compare_function determines the final order - // (e.g. greater{} -> descending). 
The buffer must be sized to N (a - // compile-time power of 2); callers pad with sentinel values when the - // logical length is smaller. + // Block-wide bitonic sort of an N-element buffer (N must be a power of 2; + // pad with sentinels when the logical length is smaller). template __device__ void block_sort(index idx, T& buf) const { @@ -167,6 +163,46 @@ struct bitonic_sort } } } + + // Bitonic sort with caller-supplied compare and swap. Callbacks are + // always invoked with i < j; compare_at(i, j) follows the same convention + // as compare_function(buf[j], buf[i]) in block_sort. + template + __device__ void block_sort_indexed(index idx, CompareAt compare_at, SwapAt swap_at) const + { + static_assert(is_power_of_2(N), "N must be a power of 2"); + for(index_int k = 2; k <= N; k <<= 1) + { + for(index_int j = k >> 1; j > 0; j >>= 1) + { + idx.local_stride(N, [&](auto tid) { + index_int partner = tid ^ j; + if(partner > tid) + { + const bool reverse = (tid & k) != 0; + if(reverse ^ compare_at(tid, partner)) + swap_at(tid, partner); + } + }); + __syncthreads(); + } + } + } + + // Sort keys (under compare_function) and swap each vals[i]/vals[j] in + // lockstep. Each ValBuf must support swap(buf[i], buf[j]). For non-scalar + // layouts, use block_sort_indexed directly. + template + __device__ void block_sort(index idx, KeyBuf& keys, ValBufs&... 
vals) const + { + block_sort_indexed( + idx, + [&](auto i, auto j) { return compare_function(keys[j], keys[i]); }, + [&](auto i, auto j) { + swap(keys[i], keys[j]); + (swap(vals[i], vals[j]), ...); + }); + } }; MIGRAPHX_AUTO_DEDUCE(bitonic_sort); diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index 192e3f8c45d..e685d538973 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -448,16 +448,9 @@ struct miopen_apply }); } - // Rewrites onnx `nonmaxsuppression` into the GPU op pipeline: - // gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact - // Each gpu::nms_* op is wrapped in gpu::precompile_op inline so the JIT - // compile pass can pick them up later. We can't rely on the main lowering - // loop to wrap them: it walks forward, and the new instructions land - // before `ins` so they would never be revisited. - // - // The kernels are JIT'd against compile-time sizes baked from the input - // shapes, so when either of `boxes` / `scores` is dynamic we fall back to - // executing the ref op on the host. + // Lowers `nonmaxsuppression` to the gpu::nms_sort -> nms_filter -> + // nms_compact pipeline, or to a host ref-op fallback when either input + // shape is dynamic (the kernels bake compile-time sizes). void add_nms_op() { apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) { @@ -469,7 +462,8 @@ struct miopen_apply }); } - // Static GPU pipeline: gpu::nms_sort -> gpu::nms_filter -> gpu::nms_compact. + // Static GPU pipeline. Each gpu::nms_* is wrapped in precompile_op inline + // because the main lowering loop walks forward and would skip them. instruction_ref lower_nms_to_gpu_pipeline(instruction_ref ins) const { auto inputs = ins->inputs(); @@ -494,8 +488,7 @@ struct miopen_apply bool center_point_box = ins->get_operator().to_value().at("center_point_box").to(); - // Mask is scratch only; allocate up-front so the standard - // replace_allocate pass can later turn it into hip::allocate. 
+ // Scratch mask; replace_allocate later turns it into hip::allocate. shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; auto mask_alloc = insert_allocation(ins, mask_shape); @@ -530,17 +523,11 @@ struct miopen_apply return mod->replace_instruction(ins, compact); } - // Dynamic-shape fallback: run the ref `nonmaxsuppression` op on the host - // and copy each tuple element back to its own GPU allocation. Downstream - // `get_tuple_elem` consumers of `ins` are rewritten in place to point at - // the per-element GPU copies; `ins` itself is left for DCE to remove. - // - // The ref op produces a tuple {indices, num_selected}, and `hip::copy_to_gpu` - // is not tuple-aware (calls `argument::data()` which asserts non-tuple), so - // we have to split the tuple on the host side before copying back. + // Dynamic-shape fallback: run the ref op on the host. The tuple has to be + // split host-side before copy_to_gpu (which is not tuple-aware), and the + // downstream get_tuple_elem consumers are rewritten in place. instruction_ref lower_nms_to_ref(instruction_ref ins) const { - // Copy each input from GPU to host, then sync before running the ref op. auto inputs = ins->inputs(); std::vector cpu_inputs; cpu_inputs.reserve(inputs.size()); @@ -551,11 +538,8 @@ struct miopen_apply cpu_inputs.front() = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); - // Ref op produces a tuple {indices [max_num_boxes, 3], num_selected [1]}. auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); - // For each sub-shape, extract on the host side and copy back to its - // own GPU allocation. const auto& sub_shapes = ins->get_shape().sub_shapes(); std::vector gpu_subs; gpu_subs.reserve(sub_shapes.size()); @@ -568,7 +552,7 @@ struct miopen_apply ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc)); } - // Snapshot outputs since we mutate the graph below. + // Snapshot since we mutate the graph below. 
auto consumers = ins->outputs(); for(auto consumer : consumers) { @@ -582,9 +566,8 @@ struct miopen_apply mod->replace_instruction(consumer, gpu_subs[idx]); } - // `ins` is now dead; leave it for dead_code_elimination. Return it so - // the apply-loop shape check (which compares against the original - // tuple shape) succeeds. + // Leave `ins` for dead_code_elimination; return it so the apply-loop + // tuple-shape check passes. return ins; } diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp index f9ac82c8ebf..de801c2aa32 100644 --- a/src/targets/gpu/nms_ops.cpp +++ b/src/targets/gpu/nms_ops.cpp @@ -32,7 +32,8 @@ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { -// Sort boxes per (batch, class) into nms_data{} tensor. +// Sort boxes per (batch, class) into per-class sorted_scores / sorted_boxes / +// sorted_indices tensors. // inputs = {boxes, scores} struct nms_sort { From 229cf90ef1ec37252ffbcda18c145e6e82b762ab Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 16:42:42 -0500 Subject: [PATCH 27/32] doc comments cleanup --- src/include/migraphx/op/nonmaxsuppression.hpp | 2 +- src/onnx/parse_nonmaxsuppression.cpp | 4 ++-- .../kernels/include/migraphx/kernels/nonmaxsuppression.hpp | 2 +- src/targets/gpu/lowering.cpp | 1 + src/targets/gpu/nms_ops.cpp | 6 +++--- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 87bd541fba1..38f076ca8de 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -308,7 +308,7 @@ struct nonmaxsuppression argument result{max_output_shape}; argument num_selected_result{output_shapes.at(1)}; - std::size_t max_output_boxes_per_class = + int64_t max_output_boxes_per_class = (args.size() > 2) ? 
(args.at(2).at()) : 0; if(max_output_boxes_per_class == 0) { diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index 959683d01e9..0c343bf970a 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,7 +42,7 @@ struct parse_nonmaxsuppression : op_parser { auto op = parser.load(opd.op_name, info); auto nms_ins = info.add_instruction(op, args); - // variable ends input slice to handle dynamic shape output + // slice with variable ends to handle dynamic shape output. auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{})) { diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index ee5b1b090e9..5b02b8136ad 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -392,7 +392,7 @@ nonmaxsuppression_compact(const Counts bc_counts, const Idx indices, Out output, const index_int box_idx = i % NumBoxes; if(box_idx < bc_counts[batch_class_idx]) { - for(int k = 0; k < 3; ++k) + for(index_int k = 0; k < 3; ++k) { output[(offsets[batch_class_idx] + box_idx) * index_size + k] = indices[(batch_class_idx * NumBoxes + box_idx) * index_size + k]; diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index e685d538973..dbb16ee5e0a 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -552,6 +552,7 @@ struct miopen_apply ins, 
make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc)); } + // TODO: this needs cleanup // Snapshot since we mutate the graph below. auto consumers = ins->outputs(); for(auto consumer : consumers) diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp index de801c2aa32..10418002216 100644 --- a/src/targets/gpu/nms_ops.cpp +++ b/src/targets/gpu/nms_ops.cpp @@ -58,9 +58,9 @@ struct nms_sort const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; const auto aligned_b = - static_cast(bit_ceil(static_cast(num_boxes))); - shape out_scores_shape{shape::float_type, {num_batches * num_classes, aligned_b}}; - shape out_boxes_shape{shape::float_type, {num_batches * num_classes, aligned_b, 4}}; + static_cast(bit_ceil(static_cast(num_boxes))); + shape out_scores_shape{scores_s.type(), {num_batches * num_classes, aligned_b}}; + shape out_boxes_shape{boxes.type(), {num_batches * num_classes, aligned_b, 4}}; shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}}; return shape{{out_scores_shape, out_boxes_shape, out_box_index_shape}}; } From 8bb7865aae1b26e7862274541da78a8288a1eefc Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 17:05:41 -0500 Subject: [PATCH 28/32] Formatting --- src/include/migraphx/op/nonmaxsuppression.hpp | 5 +- src/onnx/parse_nonmaxsuppression.cpp | 4 +- src/targets/gpu/lowering.cpp | 24 ++-- src/targets/gpu/nms_ops.cpp | 9 +- test/gpu/nonmaxsuppression.cpp | 6 +- test/onnx/parse/nms_dynamic_batch_test.cpp | 7 +- test/onnx/parse/nms_dynamic_boxes_test.cpp | 3 +- test/onnx/parse/nms_dynamic_classes_test.cpp | 3 +- test/ref/nonmaxsuppression.cpp | 112 +++++++++--------- 9 files changed, 81 insertions(+), 92 deletions(-) diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp index 38f076ca8de..4b4c47070b3 100644 --- a/src/include/migraphx/op/nonmaxsuppression.hpp +++ b/src/include/migraphx/op/nonmaxsuppression.hpp @@ -93,7 
+93,7 @@ struct nonmaxsuppression } }; - if(not (inputs.at(0).dynamic() or inputs.at(1).dynamic())) + if(not(inputs.at(0).dynamic() or inputs.at(1).dynamic())) { fixed_shape_error_check(); } @@ -308,8 +308,7 @@ struct nonmaxsuppression argument result{max_output_shape}; argument num_selected_result{output_shapes.at(1)}; - int64_t max_output_boxes_per_class = - (args.size() > 2) ? (args.at(2).at()) : 0; + int64_t max_output_boxes_per_class = (args.size() > 2) ? (args.at(2).at()) : 0; if(max_output_boxes_per_class == 0) { num_selected_result.visit([&](auto output) { output[0] = 0; }); diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index 0c343bf970a..a62e04988b0 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -40,13 +40,13 @@ struct parse_nonmaxsuppression : op_parser const onnx_parser::node_info& info, const std::vector& args) const { - auto op = parser.load(opd.op_name, info); + auto op = parser.load(opd.op_name, info); auto nms_ins = info.add_instruction(op, args); // slice with variable ends to handle dynamic shape output. auto indices = info.add_instruction(make_op("get_tuple_elem", {{"index", 0}}), nms_ins); if(enabled(MIGRAPHX_USE_DYNAMIC_NMS{})) { - //TODO: planning to make this the default behavior and removing the env var. + // TODO: planning to make this the default behavior and removing the env var. 
auto num_selected = info.add_instruction(make_op("get_tuple_elem", {{"index", 1}}), nms_ins); auto slice_ins = info.add_instruction( diff --git a/src/targets/gpu/lowering.cpp b/src/targets/gpu/lowering.cpp index dbb16ee5e0a..6fb4f0cd9c9 100644 --- a/src/targets/gpu/lowering.cpp +++ b/src/targets/gpu/lowering.cpp @@ -485,8 +485,7 @@ struct miopen_apply if(inputs.size() < 5) inputs.push_back(mod->insert_literal(ins, literal{default_thr_s, {0.0f}})); - bool center_point_box = - ins->get_operator().to_value().at("center_point_box").to(); + bool center_point_box = ins->get_operator().to_value().at("center_point_box").to(); // Scratch mask; replace_allocate later turns it into hip::allocate. shape mask_shape{shape::uint8_type, {num_batches * num_classes, iou_packed}}; @@ -531,12 +530,10 @@ struct miopen_apply auto inputs = ins->inputs(); std::vector cpu_inputs; cpu_inputs.reserve(inputs.size()); - std::transform( - inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { - return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); - }); - cpu_inputs.front() = - mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); + std::transform(inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) { + return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in); + }); + cpu_inputs.front() = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs); auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs); @@ -545,11 +542,11 @@ struct miopen_apply gpu_subs.reserve(sub_shapes.size()); for(std::size_t i = 0; i < sub_shapes.size(); ++i) { - auto cpu_sub = mod->insert_instruction( - ins, make_op("get_tuple_elem", {{"index", i}}), cpu_out); + auto cpu_sub = + mod->insert_instruction(ins, make_op("get_tuple_elem", {{"index", i}}), cpu_out); auto gpu_alloc = insert_allocation(ins, sub_shapes[i]); - gpu_subs.push_back(mod->insert_instruction( - ins, make_op("hip::copy_to_gpu"), 
cpu_sub, gpu_alloc)); + gpu_subs.push_back( + mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_sub, gpu_alloc)); } // TODO: this needs cleanup @@ -561,8 +558,7 @@ struct miopen_apply MIGRAPHX_THROW("gpu::add_nms_op: dynamic NMS fallback expects only " "get_tuple_elem consumers of nonmaxsuppression; got: " + consumer->name()); - auto idx = - consumer->get_operator().to_value().at("index").to(); + auto idx = consumer->get_operator().to_value().at("index").to(); assert(idx < gpu_subs.size()); mod->replace_instruction(consumer, gpu_subs[idx]); } diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp index 10418002216..682f1f2c003 100644 --- a/src/targets/gpu/nms_ops.cpp +++ b/src/targets/gpu/nms_ops.cpp @@ -57,10 +57,9 @@ struct nms_sort const auto num_batches = boxes_s.lens()[0]; const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; - const auto aligned_b = - static_cast(bit_ceil(static_cast(num_boxes))); + const auto aligned_b = bit_ceil(static_cast(num_boxes)); shape out_scores_shape{scores_s.type(), {num_batches * num_classes, aligned_b}}; - shape out_boxes_shape{boxes.type(), {num_batches * num_classes, aligned_b, 4}}; + shape out_boxes_shape{boxes_s.type(), {num_batches * num_classes, aligned_b, 4}}; shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}}; return shape{{out_scores_shape, out_boxes_shape, out_box_index_shape}}; } @@ -69,8 +68,8 @@ MIGRAPHX_REGISTER_OP(nms_sort); // Build the IoU mask and run the greedy filter. // Produces a tuple of (raw_output, bc_counts). -// num_batches/num_classes/num_boxes are kept as op attributes because the filter inputs -// is a scratch buffer from which these can't be recovered. +// num_batches/num_classes/num_boxes are kept as op attributes because these can't be recovered +// from the inputs. 
// inputs = {sorted_boxes, sorted_scores, sorted_box_indices, output_indices, output_bc_counts} struct nms_filter { diff --git a/test/gpu/nonmaxsuppression.cpp b/test/gpu/nonmaxsuppression.cpp index 4d9bdb89602..89fdafef7a8 100644 --- a/test/gpu/nonmaxsuppression.cpp +++ b/test/gpu/nonmaxsuppression.cpp @@ -198,10 +198,8 @@ TEST_CASE(nms_dynamic_fallback_test) using dd = migraphx::shape::dynamic_dimension; migraphx::program p; auto* mm = p.get_main_module(); - migraphx::shape boxes_dyn_s{migraphx::shape::float_type, - {dd{1, 1}, dd{4, 10}, dd{4, 4}}}; - migraphx::shape scores_dyn_s{migraphx::shape::float_type, - {dd{1, 1}, dd{1, 1}, dd{4, 8}}}; + migraphx::shape boxes_dyn_s{migraphx::shape::float_type, {dd{1, 1}, dd{4, 10}, dd{4, 4}}}; + migraphx::shape scores_dyn_s{migraphx::shape::float_type, {dd{1, 1}, dd{1, 1}, dd{4, 8}}}; auto boxes_p = mm->add_parameter("boxes", boxes_dyn_s); auto scores_p = mm->add_parameter("scores", scores_dyn_s); diff --git a/test/onnx/parse/nms_dynamic_batch_test.cpp b/test/onnx/parse/nms_dynamic_batch_test.cpp index bb7e350bea8..346f8d7de5a 100644 --- a/test/onnx/parse/nms_dynamic_batch_test.cpp +++ b/test/onnx/parse/nms_dynamic_batch_test.cpp @@ -39,12 +39,7 @@ TEST_CASE(nms_dynamic_batch_test) migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - b, - s, - mo, - iou, - st); + migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), b, s, mo, iou, st); auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); diff --git a/test/onnx/parse/nms_dynamic_boxes_test.cpp b/test/onnx/parse/nms_dynamic_boxes_test.cpp index d11552ca3d7..2b43f4200b1 100644 --- a/test/onnx/parse/nms_dynamic_boxes_test.cpp +++ b/test/onnx/parse/nms_dynamic_boxes_test.cpp @@ -38,8 +38,7 @@ TEST_CASE(nms_dynamic_boxes_test) auto iou = 
mm->add_parameter("iou_threshold", siou); migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); - auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st); auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); diff --git a/test/onnx/parse/nms_dynamic_classes_test.cpp b/test/onnx/parse/nms_dynamic_classes_test.cpp index 67a21634568..aaa8f843c1e 100644 --- a/test/onnx/parse/nms_dynamic_classes_test.cpp +++ b/test/onnx/parse/nms_dynamic_classes_test.cpp @@ -38,8 +38,7 @@ TEST_CASE(nms_dynamic_classes_test) auto iou = mm->add_parameter("iou_threshold", siou); migraphx::shape sst{migraphx::shape::float_type, {1}}; auto st = mm->add_parameter("score_threshold", sst); - auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st); + auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression"), b, s, mo, iou, st); auto ret = mm->add_instruction(migraphx::make_op("get_tuple_elem", {{"index", 0}}), nms); mm->add_return({ret}); diff --git a/test/ref/nonmaxsuppression.cpp b/test/ref/nonmaxsuppression.cpp index 8f16ec6dd75..39d95fd56d6 100644 --- a/test/ref/nonmaxsuppression.cpp +++ b/test/ref/nonmaxsuppression.cpp @@ -56,12 +56,13 @@ TEST_CASE(nms_dyn_out_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -97,7 +98,7 @@ 
TEST_CASE(nms_identical_all_dyn_out_test) max_out_l, iou_threshold, score_threshold); - auto r = add_nms_dynamic_slice(mm, nms); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -123,13 +124,13 @@ TEST_CASE(nms_dyn_batch_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -169,13 +170,13 @@ TEST_CASE(nms_dyn_boxes_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -212,13 +213,13 @@ TEST_CASE(nms_dyn_classes_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -266,7 +267,7 @@ TEST_CASE(nms_not_center_test) max_out_l, iou_threshold, 
score_threshold); - auto r = add_nms_dynamic_slice(mm, nms); + auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); p.compile(migraphx::make_target("ref")); @@ -294,12 +295,13 @@ TEST_CASE(nms_test) auto iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_l, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_l, + scores_l, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -332,12 +334,13 @@ TEST_CASE(nms_transpose1_test) auto transpose_boxes = mm->add_instruction( migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}}), t_boxes_l); - auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - transpose_boxes, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + transpose_boxes, + scores_l, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -370,12 +373,13 @@ TEST_CASE(nms_transpose2_test) auto transpose_boxes = mm->add_instruction( migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}}), t_boxes_l); - auto nms = mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - transpose_boxes, - scores_l, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + transpose_boxes, + scores_l, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); @@ -404,13 +408,13 @@ TEST_CASE(nms_dyn_different_spatial_ranges_test) auto 
iou_threshold = mm->add_literal(0.5f); auto score_threshold = mm->add_literal(0.0f); - auto nms = mm->add_instruction( - migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), - boxes_p, - scores_p, - max_out_l, - iou_threshold, - score_threshold); + auto nms = + mm->add_instruction(migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), + boxes_p, + scores_p, + max_out_l, + iou_threshold, + score_threshold); auto r = add_nms_dynamic_slice(mm, nms); mm->add_return({r}); From 4c27d5fe35485d6f5a01f6f4e63c2fef5421501a Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 17:11:31 -0500 Subject: [PATCH 29/32] Licensing --- src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp | 2 +- .../gpu/include/migraphx/gpu/compile_hip_code_object.hpp | 2 +- src/targets/gpu/jit/topk.cpp | 2 +- .../kernels/include/migraphx/kernels/nonmaxsuppression.hpp | 4 ++-- src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp | 2 +- test/multi_target/multitarget_test.cpp | 2 +- test/onnx/parse/nms_dynamic_batch_test.cpp | 2 +- test/onnx/parse/nms_dynamic_boxes_test.cpp | 2 +- test/onnx/parse/nms_dynamic_classes_test.cpp | 2 +- test/onnx/parse/nms_test.cpp | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp index 95ce82f224e..22539bb8d6c 100644 --- a/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp +++ b/src/targets/gpu/device/include/migraphx/gpu/device/scan.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp index f11051916cf..8ef0fbb6533 100644 --- a/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp +++ b/src/targets/gpu/include/migraphx/gpu/compile_hip_code_object.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/jit/topk.cpp b/src/targets/gpu/jit/topk.cpp index 1deafb2db60..a39a26a4e18 100644 --- a/src/targets/gpu/jit/topk.cpp +++ b/src/targets/gpu/jit/topk.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 5b02b8136ad..2fd277974d8 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -85,9 +85,9 @@ __device__ inline bool nms_iou_over_threshold(const Box a, const Box b, const Th } // Packed upper-triangular index for j > i within an N x N matrix. 
-__device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int N) +__device__ inline index_int nms_packed_idx(index_int i, index_int j, index_int size) { - return (i * N - (i * (i + 1)) / 2) + j - (i + 1); + return (i * size - (i * (i + 1)) / 2) + j - (i + 1); } // One block per (batch_idx, class_idx). Initializes the per-block slice of diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index 491e9348e1e..5403783a601 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/multi_target/multitarget_test.cpp b/test/multi_target/multitarget_test.cpp index 1ca5758e74a..40e4dd37ecf 100644 --- a/test/multi_target/multitarget_test.cpp +++ b/test/multi_target/multitarget_test.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/onnx/parse/nms_dynamic_batch_test.cpp b/test/onnx/parse/nms_dynamic_batch_test.cpp index 346f8d7de5a..f9ac10fa4aa 100644 --- a/test/onnx/parse/nms_dynamic_batch_test.cpp +++ b/test/onnx/parse/nms_dynamic_batch_test.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/onnx/parse/nms_dynamic_boxes_test.cpp b/test/onnx/parse/nms_dynamic_boxes_test.cpp index 2b43f4200b1..2b11265d00c 100644 --- a/test/onnx/parse/nms_dynamic_boxes_test.cpp +++ b/test/onnx/parse/nms_dynamic_boxes_test.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/onnx/parse/nms_dynamic_classes_test.cpp b/test/onnx/parse/nms_dynamic_classes_test.cpp index aaa8f843c1e..8f8a3abd9d2 100644 --- a/test/onnx/parse/nms_dynamic_classes_test.cpp +++ b/test/onnx/parse/nms_dynamic_classes_test.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/onnx/parse/nms_test.cpp b/test/onnx/parse/nms_test.cpp index f8826a8a96e..3dbf522b504 100644 --- a/test/onnx/parse/nms_test.cpp +++ b/test/onnx/parse/nms_test.cpp @@ -1,7 +1,7 @@ /* * The MIT License (MIT) * - * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal From b3765f6cd8315b8afe11019600628f3a8808e6e6 Mon Sep 17 00:00:00 2001 From: charlie Date: Thu, 21 May 2026 17:12:45 -0500 Subject: [PATCH 30/32] Formatting continued --- test/multi_target/multitarget_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/multi_target/multitarget_test.cpp b/test/multi_target/multitarget_test.cpp index 40e4dd37ecf..d5a196375d3 100644 --- a/test/multi_target/multitarget_test.cpp +++ b/test/multi_target/multitarget_test.cpp @@ -216,7 +216,7 @@ TEST_CASE(single_target_multi_compile) auto max_out_l = gpu_mod->add_literal(int64_t{4}); auto iou_threshold = gpu_mod->add_literal(0.5f); auto score_threshold = gpu_mod->add_literal(0.0f); - auto nms = gpu_mod->add_instruction( + auto nms = gpu_mod->add_instruction( migraphx::make_op("nonmaxsuppression", {{"center_point_box", true}}), boxes_param_gpu, scores_l, @@ -244,7 +244,7 @@ TEST_CASE(single_target_multi_compile) // eval migraphx::parameter_map params; std::vector boxes_vec = {0.5, 0.5, 1.0, 1.0, 0.5, 0.6, 1.0, 1.0, 0.5, 0.4, 1.0, 1.0, - 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; + 0.5, 10.5, 1.0, 1.0, 0.5, 10.6, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}; params["boxes"] = migraphx::argument(boxes_s, boxes_vec.data()); auto output = p.eval(params).back(); std::vector gold_vec = {0, 0, 3, 0, 0, 0, 0, 0, 5}; From 0bd8d04bf62f21ad3fee63a0739e10f185dfbe90 Mon Sep 17 00:00:00 2001 From: charlie Date: Tue, 26 May 2026 13:11:11 -0500 Subject: [PATCH 31/32] Add changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5f19d84cb1..60c6c5dbf19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ Full documentation for MIGraphX is available at * Added N-D scale and zero-point support for `QLinearMatMul` operator. 
* Added test cases for `QLinearConv` per-channel scale and `QLinearMatMul` N-D per-channel quantization. * Added find_concat_same_input matcher to convert concat(N*x) into multibroadcast(x) to reduce hipCopy() (#4981) +* Added GPU kernel for ONNX `NonMaxSuppression` operation and redesigned the `nonmaxsuppression` operation to better represent the data-dependent output shape in the MIGraphX IR (#4893). + ### Changed * Converted `nonzero` operator from device implementation to JIT compilation (#4720). @@ -68,6 +70,7 @@ Full documentation for MIGraphX is available at ### Removed * Removed legacy device implementations for `argmin` and `argmax` in favor of the JIT implementations recently added (#4658). +* Removed `onnx_options::use_dyn_output` after redesign of `NonMaxSuppression` operator (#4893). ## MIGraphX 2.15 for ROCm 7.2.0 From 59b95b7e542644a3dc0300b40d1de8f4fdd82568 Mon Sep 17 00:00:00 2001 From: charlie Date: Wed, 27 May 2026 14:27:13 -0500 Subject: [PATCH 32/32] Tidy and formatting --- src/onnx/parse_nonmaxsuppression.cpp | 1 + src/targets/gpu/jit/nonmaxsuppression.cpp | 6 ++---- .../kernels/include/migraphx/kernels/nonmaxsuppression.hpp | 2 ++ src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp | 4 ++++ src/targets/gpu/nms_ops.cpp | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/onnx/parse_nonmaxsuppression.cpp b/src/onnx/parse_nonmaxsuppression.cpp index a62e04988b0..5549e6a102b 100644 --- a/src/onnx/parse_nonmaxsuppression.cpp +++ b/src/onnx/parse_nonmaxsuppression.cpp @@ -24,6 +24,7 @@ #include #include #include +#include MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_DYNAMIC_NMS) diff --git a/src/targets/gpu/jit/nonmaxsuppression.cpp b/src/targets/gpu/jit/nonmaxsuppression.cpp index 732e8702410..a1b3109f413 100644 --- a/src/targets/gpu/jit/nonmaxsuppression.cpp +++ b/src/targets/gpu/jit/nonmaxsuppression.cpp @@ -150,8 +150,7 @@ struct nms_sort_compiler : compiler const auto num_batches = boxes_s.lens()[0]; const auto num_boxes 
= boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; - const auto aligned_num_boxes = - static_cast(bit_ceil(static_cast(num_boxes))); + const std::size_t aligned_num_boxes = bit_ceil(num_boxes); // NOTE: topK kernel uses relement/4 for amount of work in a block? auto block_size = compute_block_size(ctx, aligned_num_boxes, 1024); @@ -192,8 +191,7 @@ struct nms_filter_compiler : compiler const auto num_batches = v.at("num_batches").to(); const auto num_classes = v.at("num_classes").to(); const auto num_boxes = v.at("num_boxes").to(); - const auto aligned_num_boxes = - static_cast(bit_ceil(static_cast(num_boxes))); + const std::size_t aligned_num_boxes = bit_ceil(num_boxes); // TODO: tune for max block size? // ceil_div(num_boxes, 2) because of strided thread work distribution const auto block_size = compute_block_size(ctx, (num_boxes + 1) / 2, 256); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp index 2fd277974d8..2f104a4676b 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/nonmaxsuppression.hpp @@ -165,6 +165,7 @@ __device__ void nonmaxsuppression_sort(const Boxes boxes_tv, // Sort scores descending, dragging boxes and indices along. Uses the // indexed variant so we can swap all 4 box lanes per index pair. 
+ // NOLINTNEXTLINE(clang-diagnostic-error) bitonic_sort{greater{}}.template block_sort_indexed( idx, [&](auto i, auto j) { return my_sorted_scores[j] > my_sorted_scores[i]; }, @@ -214,6 +215,7 @@ __device__ void nms_make_iou_mask(const index idx, }); // Have thread 0 do middle row if odd NumBoxes + // NOLINTNEXTLINE(hicpp-signed-bitwise) if constexpr((NumBoxes & 1) != 0 and NumBoxes > 1) { if(idx.local == 0) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp index 5403783a601..7c603eeb9b8 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/sort.hpp @@ -146,8 +146,10 @@ struct bitonic_sort __device__ void block_sort(index idx, T& buf) const { static_assert(is_power_of_2(N), "N must be a power of 2"); + //NOLINTNEXTLINE(hicpp-signed-bitwise) for(index_int k = 2; k <= N; k <<= 1) { + //NOLINTNEXTLINE(hicpp-signed-bitwise) for(index_int j = k >> 1; j > 0; j >>= 1) { idx.local_stride(N, [&](auto tid) { @@ -171,8 +173,10 @@ struct bitonic_sort __device__ void block_sort_indexed(index idx, CompareAt compare_at, SwapAt swap_at) const { static_assert(is_power_of_2(N), "N must be a power of 2"); + //NOLINTNEXTLINE(hicpp-signed-bitwise) for(index_int k = 2; k <= N; k <<= 1) { + //NOLINTNEXTLINE(hicpp-signed-bitwise) for(index_int j = k >> 1; j > 0; j >>= 1) { idx.local_stride(N, [&](auto tid) { diff --git a/src/targets/gpu/nms_ops.cpp b/src/targets/gpu/nms_ops.cpp index 682f1f2c003..76778dc916c 100644 --- a/src/targets/gpu/nms_ops.cpp +++ b/src/targets/gpu/nms_ops.cpp @@ -57,7 +57,7 @@ struct nms_sort const auto num_batches = boxes_s.lens()[0]; const auto num_boxes = boxes_s.lens()[1]; const auto num_classes = scores_s.lens()[1]; - const auto aligned_b = bit_ceil(static_cast(num_boxes)); + const auto aligned_b = bit_ceil(static_cast(num_boxes)); shape out_scores_shape{scores_s.type(), {num_batches * num_classes, 
aligned_b}}; shape out_boxes_shape{boxes_s.type(), {num_batches * num_classes, aligned_b, 4}}; shape out_box_index_shape{shape::int32_type, {num_batches * num_classes, aligned_b}}; @@ -98,7 +98,7 @@ struct nms_filter MIGRAPHX_REGISTER_OP(nms_filter); // TODO: This should work in-place, saving memory. Need to update IR to handle it. -// Needs a make_tuple type of operator that reuses the indicies input. +// Needs a make_tuple type of operator that reuses the indices input. // Prefix-scan the per-block counts and compact the selections into // the final selected_indices. Output as selected_indices and num_selected tuple. // inputs = {output_bc_counts, output_indices}