From ba6a90be19a25d0e09cca2a0f5f9dc021588ac81 Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Mon, 25 May 2026 22:35:16 +0530 Subject: [PATCH 1/2] divide gemm_n by fAttrGroup in grouped Conv GPU path --- core/inc/SOFIE/ROperator_Conv.hxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/inc/SOFIE/ROperator_Conv.hxx b/core/inc/SOFIE/ROperator_Conv.hxx index 835a0ff..6711bf0 100644 --- a/core/inc/SOFIE/ROperator_Conv.hxx +++ b/core/inc/SOFIE/ROperator_Conv.hxx @@ -836,11 +836,11 @@ public: size_t gemm_n = outChannels; // output channels size_t gemm_k = fShapeW[1] * kernelSize; // input channels/group * kernel volume size_t gemm_m = oDepth * oHeight * oWidth; // output spatial size per channel + if (fAttrGroup > 1) gemm_n /= fAttrGroup; // per-group output channels for grouped conv size_t colElements = gemm_k * gemm_m; // colRows * colCols size_t wTotal = ConvertShapeToLength(fShapeW); // For group conv: per-group output channels and _f offset - // gemm_n stays as total output channels — we divide per group at launch size_t groupFOffset = gemm_n * gemm_k; // elements of _f per group std::stringstream out; @@ -986,6 +986,7 @@ public: size_t gemm_n_ = fShapeW[0]; size_t gemm_k_ = fShapeW[1] * kSize_; size_t gemm_m_ = oDepth_ * oHeight_ * oWidth_; + if (fAttrGroup > 1) gemm_n_ /= fAttrGroup; auto lda = std::to_string(gemm_m_); // ld for xcol^T (gemm_m×gemm_k col-major) auto ldb = std::to_string(gemm_k_); // ld for xf^T (gemm_k×gemm_n col-major) auto ldc = std::to_string(gemm_m_); // ld for y^T (gemm_m×gemm_n col-major) From d732b8d4f05d7577c7da26d8e76be5718e11348d Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Thu, 11 Jun 2026 15:40:24 +0530 Subject: [PATCH 2/2] add gtest for grouped + batched Conv GPU inference --- .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 37 ++++++++++++++++++ core/test/input_models/ConvGroupBatch.onnx | Bin 0 -> 551 bytes .../references/ConvGroupBatch.ref.hxx | 5 +++ .../references/ConvGroupBatch_input.ref.hxx | 5 +++ 4 files changed, 47 insertions(+) create mode 100644 core/test/input_models/ConvGroupBatch.onnx create mode 100644 core/test/input_models/references/ConvGroupBatch.ref.hxx create mode 100644 core/test/input_models/references/ConvGroupBatch_input.ref.hxx diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index e415cce..7963095 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -115,6 +115,10 @@ #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" +#include "ConvGroupBatch_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvGroupBatch.ref.hxx" +#include "input_models/references/ConvGroupBatch_input.ref.hxx" + #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" @@ -2200,6 +2204,39 @@ TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding) } } +// group=2 and batch=4 together; patch test for the gemm_n /= fAttrGroup fix +TEST_F(SofieAlpakaTest, ConvGroupBatch) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr size_t N = sizeof(ConvGroupBatch_Input::data) / sizeof(float); + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = ConvGroupBatch_Input::data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + constexpr size_t nOut = sizeof(ConvGroupBatch_ExpectedOutput::output) / sizeof(float); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{nOut})); + + { + SOFIE_ConvGroupBatch::Session session("ConvGroupBatch_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ConvGroupBatch_ExpectedOutput::output; + for (size_t i = 0; i < nOut; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + TEST_F(SofieAlpakaTest, BatchNormalization) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; diff --git a/core/test/input_models/ConvGroupBatch.onnx b/core/test/input_models/ConvGroupBatch.onnx new file mode 100644 index 0000000000000000000000000000000000000000..0f58d6629f6bd10c8d2cc586633e1f9717c02a0d GIT binary patch literal 551 zcmd$Q1yrLB8ylv@k!X4}QuxUychGu^-0ZsKR}y}rNP>^yZ?>~5>}*d3qiy6<)a zkKNnD5q7>3TlQTJDcWa$@xs2Xn_~7p{48Xba($+q>FFXHy`#(QI)ZxlspYQQcZ$Vq zk6yrB+x0Hjb|2hwdEav8NqZhoZr!J-#%VWKW18*J8i#$isxIyIejBlmE7^C?_c_jX z`42qz$&{Y6?KZr*H$G8m-^u9pdzx87_JtjLwAVMdbf37vNxK}W=hjDk`S)G-(ziXs zxns|I7Sp{v6MoxTC`;Rkc;DRn`qt~cZ9$29tGB=2rzOe30tzc9FM-SI()&(*=eE0E usA0F1>CB!ed1%B6iE)W=FbWB90YevxS%EZbk^&brvT>