From c126ada90252ae097f0976ee42e293f221cf45fa Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Fri, 29 May 2026 17:32:16 +0530 Subject: [PATCH 1/4] 2d maxpool gpu kernel and test --- core/inc/SOFIE/ROperator.hxx | 3 +- core/inc/SOFIE/ROperator_Pool.hxx | 96 +++++++++++++++++++ .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 43 +++++++++ 3 files changed, 141 insertions(+), 1 deletion(-) diff --git a/core/inc/SOFIE/ROperator.hxx b/core/inc/SOFIE/ROperator.hxx index c9ce5cd..db49a0d 100644 --- a/core/inc/SOFIE/ROperator.hxx +++ b/core/inc/SOFIE/ROperator.hxx @@ -38,7 +38,8 @@ enum class OperatorKind { UNARY_COS=22, UNARY_ABS=23, CLIP=24, - NOT=25 + NOT=25, + POOL=26 }; inline const char* toString(OperatorKind kind) { diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx index 8e11271..1eef141 100644 --- a/core/inc/SOFIE/ROperator_Pool.hxx +++ b/core/inc/SOFIE/ROperator_Pool.hxx @@ -80,6 +80,7 @@ public: } fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; + fKind = OperatorKind::POOL; } // return input type (defined abstract in ROperator class ) @@ -472,6 +473,101 @@ public: out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); + if (fPoolMode != MaxPool) + throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far"); + if (fDim != 2) + throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far"); + + std::string kname = "MaxPoolKernel_" + opName; + + std::stringstream op; + op << "\n//------ MAXPOOL_KERNEL_ALPAKA\n"; + op << SP << "struct " << kname << " {\n"; + op << SP << SP << "template\n"; + op << SP << SP << "ALPAKA_FN_ACC void operator()(\n"; + op << SP << SP << SP << "TAcc const& acc,\n"; + op << SP << SP << SP << "T const* __restrict__ X,\n"; + op << SP << SP << SP << "T* __restrict__ Y,\n"; + op << SP << SP << SP << "std::size_t const totalOut) const {\n\n"; + + op << SP << SP << SP << "constexpr int H = " << fShapeX[2] << ";\n"; + op << SP << SP << SP << "constexpr int W = " << fShapeX[3] << ";\n"; + op << SP << SP << SP << "constexpr int OH = " << fShapeY[2] << ";\n"; + op << SP << SP << SP << "constexpr int OW = " << fShapeY[3] << ";\n"; + op << SP << SP << SP << "constexpr int kh = " << fAttrKernelShape[0] << ";\n"; + op << SP << SP << SP << "constexpr int kw = " << fAttrKernelShape[1] << ";\n"; + op << SP << SP << SP << "constexpr int sh = " << fAttrStrides[0] << ";\n"; + op << SP << SP << SP << "constexpr int sw = " << fAttrStrides[1] << ";\n"; + op << SP << SP << SP << "constexpr int pad_top = " << fAttrPads[0] << ";\n"; + op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2] << ";\n\n"; + + op << SP << SP << SP << "auto const tid = alpaka::getIdx(acc)[0];\n"; + op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv(acc)[0];\n\n"; + + op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n"; + op << SP << SP << SP << SP << "int ow = idx % OW;\n"; + op << SP << SP << SP << SP << "int oh = (idx / OW) % OH;\n"; + op << SP << SP << SP << SP << "int nc = idx / (OH * OW);\n"; + op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; + op << SP << SP << SP << SP << "int j = ow * sw - pad_left;\n"; + op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * (H * W);\n\n"; + op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; + op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; + op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n"; + op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n"; + op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n"; + op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << SP << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "Y[idx] = value;\n"; + op << SP << SP << SP << "}\n"; + op << SP << SP << "}\n"; + op << SP << "};\n"; + + return op.str(); + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fPoolMode != MaxPool || fDim != 2) + return ""; + std::string kname = "MaxPoolKernel_" + opName; + return SP + kname + " maxPoolKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); + if (fPoolMode != MaxPool) + throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far"); + if (fDim != 2) + throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far"); + + std::size_t totalOut = ConvertShapeToLength(fShapeY); + std::string kname = "maxPoolKernel_" + opName; + + std::stringstream out; + out << "\n//------ MAXPOOL_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << totalOut << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + + out << SP << "auto task_" << fNY << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalOut << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << fNY << ");\n"; + return out.str(); } }; diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index 5ad9383..e513c27 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -164,6 +164,9 @@ #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" +#include "MaxPool2d_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/MaxPool2d.ref.hxx" + #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" @@ -2442,6 +2445,46 @@ TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding) } } +TEST_F(SofieAlpakaTest, MaxPool2d) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.6266, 0.1656, 0.2753, -0.4558, -1.4592, 0.9285, -1.3410, 1.3223, -0.5936, -1.3648, + -0.2989, 0.5901, -0.8845, -0.0433, 0.8314, -1.7159, -0.5765, 0.8678, 1.0257, 0.7847, + -0.3421, -1.2364, -0.5805, 0.4421, 1.2184, 0.5043, 1.6823, -1.0483, -2.2798, -1.8927, + 0.7716, 0.0405, 0.3121, -0.3011, -0.3266, -1.9660, 1.0837, 0.2317, 0.9084, -0.3285, + -0.9398, -0.2065, -0.9499, -0.9739, -0.1288, -0.1375, -1.2612, 0.8810, 0.8506, 0.4455 + }); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(MaxPool2d_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_MaxPool2d::Session session("MaxPool2d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = MaxPool2d_ExpectedOutput::output; + constexpr size_t nOut_maxpool2d = sizeof(MaxPool2d_ExpectedOutput::output) / sizeof(float); + + for (size_t i = 0; i < nOut_maxpool2d; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + TEST_F(SofieAlpakaTest, BatchNormalization) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; From ae76b18d870d7261cf81311ceeacf5a7374daaf7 Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Fri, 29 May 2026 21:57:53 +0530 Subject: [PATCH 2/4] empty codegen for unsupported pool variants --- core/inc/SOFIE/ROperator_Pool.hxx | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx index 1eef141..728b3d8 100644 --- a/core/inc/SOFIE/ROperator_Pool.hxx +++ b/core/inc/SOFIE/ROperator_Pool.hxx @@ -480,10 +480,8 @@ public: opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); - if (fPoolMode != MaxPool) - throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far"); - if (fDim != 2) - throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far"); + if (fPoolMode != MaxPool || fDim != 2) + return ""; std::string kname = "MaxPoolKernel_" + opName; @@ -547,10 +545,8 @@ public: opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); - if (fPoolMode != MaxPool) - throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far"); - if (fDim != 2) - throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far"); + if (fPoolMode != MaxPool || fDim != 2) + return ""; std::size_t totalOut = ConvertShapeToLength(fShapeY); std::string kname = "maxPoolKernel_" + opName; From 21b0fbbf937c48af0dcef19f59ec38f33e585a7b Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Fri, 29 May 2026 22:47:31 +0530 Subject: [PATCH 3/4] 1d and 3d maxpool gpu kernels and tests --- core/inc/SOFIE/ROperator_Pool.hxx | 144 +++++++++++++----- .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 82 ++++++++++ 2 files changed, 191 insertions(+), 35 deletions(-) diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx index 728b3d8..ca66e1e 100644 --- a/core/inc/SOFIE/ROperator_Pool.hxx +++ b/core/inc/SOFIE/ROperator_Pool.hxx @@ -480,7 +480,7 @@ public: opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); - if (fPoolMode != MaxPool || fDim != 2) + if (fPoolMode != MaxPool) return ""; std::string kname = "MaxPoolKernel_" + opName; @@ -495,38 +495,112 @@ public: op << SP << SP << SP << "T* __restrict__ Y,\n"; op << SP << SP << SP << "std::size_t const totalOut) const {\n\n"; - op << SP << SP << SP << "constexpr int H = " << fShapeX[2] << ";\n"; - op << SP << SP << SP << "constexpr int W = " << fShapeX[3] << ";\n"; - op << SP << SP << SP << "constexpr int OH = " << fShapeY[2] << ";\n"; - op << SP << SP << SP << "constexpr int OW = " << fShapeY[3] << ";\n"; - op << SP << SP << SP << "constexpr int kh = " << fAttrKernelShape[0] << ";\n"; - op << SP << SP << SP << "constexpr int kw = " << fAttrKernelShape[1] << ";\n"; - op << SP << SP << SP << "constexpr int sh = " << fAttrStrides[0] << ";\n"; - op << SP << SP << SP << "constexpr int sw = " << fAttrStrides[1] << ";\n"; - op << SP << SP << SP << "constexpr int pad_top = " << fAttrPads[0] << ";\n"; - op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2] << ";\n\n"; - - op << SP << SP << SP << "auto const tid = alpaka::getIdx(acc)[0];\n"; - op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv(acc)[0];\n\n"; - - op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n"; - op << SP << SP << SP << SP << "int ow = idx % OW;\n"; - op << SP << SP << SP << SP << "int oh = (idx / OW) % OH;\n"; - op << SP << SP << SP << SP << "int nc = idx / (OH * OW);\n"; - op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; - op << SP << SP << SP << SP << "int j = ow * sw - pad_left;\n"; - op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * (H * W);\n\n"; - op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; - op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; - op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; - op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n"; - op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n"; - op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n"; - op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; - op << SP << SP << SP << SP << SP << "}\n"; - op << SP << SP << SP << SP << "}\n"; - op << SP << SP << SP << SP << "Y[idx] = value;\n"; - op << SP << SP << SP << "}\n"; + if (fDim == 1) { + op << SP << SP << SP << "constexpr int H = " << fShapeX[2] << ";\n"; + op << SP << SP << SP << "constexpr int OH = " << fShapeY[2] << ";\n"; + op << SP << SP << SP << "constexpr int kh = " << fAttrKernelShape[0] << ";\n"; + op << SP << SP << SP << "constexpr int sh = " << fAttrStrides[0] << ";\n"; + op << SP << SP << SP << "constexpr int pad_top = " << fAttrPads[0] << ";\n\n"; + + op << SP << SP << SP << "auto const tid = alpaka::getIdx(acc)[0];\n"; + op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv(acc)[0];\n\n"; + + op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n"; + op << SP << SP << SP << SP << "int oh = idx % OH;\n"; + op << SP << SP << SP << SP << "int nc = idx / OH;\n"; + op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; + op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * H;\n\n"; + op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; + op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; + op << SP << SP << SP << SP << SP << "T xv = X[base + l];\n"; + op << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "Y[idx] = value;\n"; + op << SP << SP << SP << "}\n"; + } + else if (fDim == 2) { + op << SP << SP << SP << "constexpr int H = " << fShapeX[2] << ";\n"; + op << SP << SP << SP << "constexpr int W = " << fShapeX[3] << ";\n"; + op << SP << SP << SP << "constexpr int OH = " << fShapeY[2] << ";\n"; + op << SP << SP << SP << "constexpr int OW = " << fShapeY[3] << ";\n"; + op << SP << SP << SP << "constexpr int kh = " << fAttrKernelShape[0] << ";\n"; + op << SP << SP << SP << "constexpr int kw = " << fAttrKernelShape[1] << ";\n"; + op << SP << SP << SP << "constexpr int sh = " << fAttrStrides[0] << ";\n"; + op << SP << SP << SP << "constexpr int sw = " << fAttrStrides[1] << ";\n"; + op << SP << SP << SP << "constexpr int pad_top = " << fAttrPads[0] << ";\n"; + op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2] << ";\n\n"; + + op << SP << SP << SP << "auto const tid = alpaka::getIdx(acc)[0];\n"; + op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv(acc)[0];\n\n"; + + op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n"; + op << SP << SP << SP << SP << "int ow = idx % OW;\n"; + op << SP << SP << SP << SP << "int oh = (idx / OW) % OH;\n"; + op << SP << SP << SP << SP << "int nc = idx / (OH * OW);\n"; + op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; + op << SP << SP << SP << SP << "int j = ow * sw - pad_left;\n"; + op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * (H * W);\n\n"; + op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; + op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; + op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n"; + op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n"; + op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n"; + op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << SP << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "Y[idx] = value;\n"; + op << SP << SP << SP << "}\n"; + } + else if (fDim == 3) { + op << SP << SP << SP << "constexpr int H = " << fShapeX[2] << ";\n"; + op << SP << SP << SP << "constexpr int W = " << fShapeX[3] << ";\n"; + op << SP << SP << SP << "constexpr int D = " << fShapeX[4] << ";\n"; + op << SP << SP << SP << "constexpr int OH = " << fShapeY[2] << ";\n"; + op << SP << SP << SP << "constexpr int OW = " << fShapeY[3] << ";\n"; + op << SP << SP << SP << "constexpr int OD = " << fShapeY[4] << ";\n"; + op << SP << SP << SP << "constexpr int kh = " << fAttrKernelShape[0] << ";\n"; + op << SP << SP << SP << "constexpr int kw = " << fAttrKernelShape[1] << ";\n"; + op << SP << SP << SP << "constexpr int kd = " << fAttrKernelShape[2] << ";\n"; + op << SP << SP << SP << "constexpr int sh = " << fAttrStrides[0] << ";\n"; + op << SP << SP << SP << "constexpr int sw = " << fAttrStrides[1] << ";\n"; + op << SP << SP << SP << "constexpr int sd = " << fAttrStrides[2] << ";\n"; + op << SP << SP << SP << "constexpr int pad_top = " << fAttrPads[0] << ";\n"; + op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2] << ";\n"; + op << SP << SP << SP << "constexpr int pad_front = " << fAttrPads[4] << ";\n\n"; + + op << SP << SP << SP << "auto const tid = alpaka::getIdx(acc)[0];\n"; + op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv(acc)[0];\n\n"; + + op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n"; + op << SP << SP << SP << SP << "int od = idx % OD;\n"; + op << SP << SP << SP << SP << "int ow = (idx / OD) % OW;\n"; + op << SP << SP << SP << SP << "int oh = (idx / (OD * OW)) % OH;\n"; + op << SP << SP << SP << SP << "int nc = idx / (OD * OW * OH);\n"; + op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; + op << SP << SP << SP << SP << "int j = ow * sw - pad_left;\n"; + op << SP << SP << SP << SP << "int k = od * sd - pad_front;\n"; + op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * (H * W * D);\n\n"; + op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; + op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; + op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n"; + op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n"; + op << SP << SP << SP << SP << SP << SP << "for (int p = k; p < k + kd; ++p) {\n"; + op << SP << SP << SP << SP << SP << SP << SP << "if (p < 0 || p >= D) continue;\n"; + op << SP << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * (W * D) + m * D + p];\n"; + op << SP << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << SP << SP << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "}\n"; + op << SP << SP << SP << SP << "Y[idx] = value;\n"; + op << SP << SP << SP << "}\n"; + } + else { + return ""; + } + op << SP << SP << "}\n"; op << SP << "};\n"; @@ -535,7 +609,7 @@ public: std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { opName = "op_" + opName; - if (fPoolMode != MaxPool || fDim != 2) + if (fPoolMode != MaxPool) return ""; std::string kname = "MaxPoolKernel_" + opName; return SP + kname + " maxPoolKernel_" + opName + ";\n"; @@ -545,7 +619,7 @@ public: opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); - if (fPoolMode != MaxPool || fDim != 2) + if (fPoolMode != MaxPool) return ""; std::size_t totalOut = ConvertShapeToLength(fShapeY); diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index e513c27..62a7166 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -164,9 +164,15 @@ #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" +#include "MaxPool1d_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/MaxPool1d.ref.hxx" + #include "MaxPool2d_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/MaxPool2d.ref.hxx" +#include "MaxPool3d_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/MaxPool3d.ref.hxx" + #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" @@ -2485,6 +2491,82 @@ TEST_F(SofieAlpakaTest, MaxPool2d) } } +TEST_F(SofieAlpakaTest, MaxPool1d) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.0907, 0.1029, 0.8143, 1.4497, -0.7785, 0.3825, -0.3764, 1.5785, -0.0835, 0.1622, + 1.5867, 0.9823, -0.8821, 0.4439, -0.1378, -0.2273, -0.0198, -2.0230, 0.0905, 0.6674, + -1.4290, -1.3100, -0.9439, -0.0833, -0.1919, 0.6886, 0.9389, -1.2914, -1.3584, -2.0341, + -0.3269, 0.1704, 1.1776, 1.3972, -1.8874, -1.5334, 1.1541, 0.3011, 0.6569, -2.3504, + 0.4033, 0.1142, 2.2846, -1.3948, -0.8573, 0.5756, -1.0864, 0.2283, 0.8947, 1.7627, + -0.1657, 0.0649, -1.6066, 0.4162, -1.1525, -0.8184, 1.1324, -1.1086, 0.1061, 1.0071 + }); // took from reference output + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(MaxPool1d_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_MaxPool1d::Session session("MaxPool1d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = MaxPool1d_ExpectedOutput::output; + constexpr size_t nOut_maxpool1d = sizeof(MaxPool1d_ExpectedOutput::output) / sizeof(float); + for (size_t i = 0; i < nOut_maxpool1d; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + +TEST_F(SofieAlpakaTest, MaxPool3d) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + -2.6496, 1.0476, -0.5153, 0.3771, 0.4129, -0.3077, -0.8717, -0.8040, -0.3525, + -0.1765, -0.3364, 0.8737, -0.2381, -0.8297, 0.4666, 0.6984, -0.6760, 0.6298, + 1.3833, 0.1101, 0.2039, -0.5477, 0.2341, 0.9181, 0.3842, 0.2428, 1.7924 + });// took from reference output + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(MaxPool3d_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_MaxPool3d::Session session("MaxPool3d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = MaxPool3d_ExpectedOutput::output; + constexpr size_t nOut_maxpool3d = sizeof(MaxPool3d_ExpectedOutput::output) / sizeof(float); + + for (size_t i = 0; i < nOut_maxpool3d; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + TEST_F(SofieAlpakaTest, BatchNormalization) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; From 22b059c855ed73f73de65808ee029e3be840fe53 Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Sun, 7 Jun 2026 15:53:16 +0530 Subject: [PATCH 4/4] added AvgPool and GlobalAvgPool GPU kernels + tests --- core/inc/SOFIE/ROperator_Pool.hxx | 76 ++++++-- .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 168 ++++++++++++++++++ .../input_models/AvgPoolCountIncludePad.onnx | Bin 0 -> 215 bytes core/test/input_models/AvgPoolPad.onnx | Bin 0 -> 177 bytes core/test/input_models/GlobalAvgPool2d.onnx | Bin 0 -> 127 bytes .../references/AvgPoolCountIncludePad.ref.hxx | 3 + .../references/AvgPoolPad.ref.hxx | 3 + .../references/GlobalAvgPool2d.ref.hxx | 3 + 8 files changed, 234 insertions(+), 19 deletions(-) create mode 100644 core/test/input_models/AvgPoolCountIncludePad.onnx create mode 100644 core/test/input_models/AvgPoolPad.onnx create mode 100644 core/test/input_models/GlobalAvgPool2d.onnx create mode 100644 core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx create mode 100644 core/test/input_models/references/AvgPoolPad.ref.hxx create mode 100644 core/test/input_models/references/GlobalAvgPool2d.ref.hxx diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx index ca66e1e..7d99b64 100644 --- a/core/inc/SOFIE/ROperator_Pool.hxx +++ b/core/inc/SOFIE/ROperator_Pool.hxx @@ -480,13 +480,49 @@ public: opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); - if (fPoolMode != MaxPool) + if (fPoolMode != MaxPool && fPoolMode != AveragePool) return ""; - std::string kname = "MaxPoolKernel_" + opName; + const bool isAvg = (fPoolMode == AveragePool); + bool doPadding = false; + for (auto & e : fAttrPads) doPadding |= (e > 0); + // count_include_pad == 0 with padding: divide by the in-bounds cells counted + // at run time; otherwise by the constant kernel area (CPU Generate above). + const bool runtimeCount = isAvg && fAttrCountIncludePad == 0 && doPadding; + + const std::string kname = (isAvg ? "AvgPoolKernel_" : "MaxPoolKernel_") + opName; + + // Mode dependent fragments, so the index math stays shared across 1D/2D/3D. + auto emitInit = [&](const std::string & ind) { + std::string s; + if (isAvg) { + s += ind + "T value = static_cast(0);\n"; + if (runtimeCount) s += ind + "int count = 0;\n"; + } else { + s += ind + "T value = static_cast(-INFINITY);\n"; + } + return s; + }; + auto emitAccum = [&](const std::string & ind, const std::string & xidx) { + std::string s; + if (isAvg) { + s += ind + "value += X[" + xidx + "];\n"; + if (runtimeCount) s += ind + "++count;\n"; + } else { + s += ind + "T xv = X[" + xidx + "];\n"; + s += ind + "if (xv > value) value = xv;\n"; + } + return s; + }; + auto emitFinal = [&](const std::string & ind, const std::string & area) { + std::string s; + if (isAvg) + s += ind + "value /= static_cast(" + (runtimeCount ? std::string("count") : area) + ");\n"; + return s; + }; std::stringstream op; - op << "\n//------ MAXPOOL_KERNEL_ALPAKA\n"; + op << "\n//------ " << (isAvg ? "AVGPOOL" : "MAXPOOL") << "_KERNEL_ALPAKA\n"; op << SP << "struct " << kname << " {\n"; op << SP << SP << "template\n"; op << SP << SP << "ALPAKA_FN_ACC void operator()(\n"; @@ -510,12 +546,12 @@ public: op << SP << SP << SP << SP << "int nc = idx / OH;\n"; op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * H;\n\n"; - op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << emitInit(SP + SP + SP + SP); op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; - op << SP << SP << SP << SP << SP << "T xv = X[base + l];\n"; - op << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << emitAccum(SP + SP + SP + SP + SP, "base + l"); op << SP << SP << SP << SP << "}\n"; + op << emitFinal(SP + SP + SP + SP, "kh"); op << SP << SP << SP << SP << "Y[idx] = value;\n"; op << SP << SP << SP << "}\n"; } @@ -541,15 +577,15 @@ public: op << SP << SP << SP << SP << "int i = oh * sh - pad_top;\n"; op << SP << SP << SP << SP << "int j = ow * sw - pad_left;\n"; op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * (H * W);\n\n"; - op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << emitInit(SP + SP + SP + SP); op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n"; op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n"; - op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n"; - op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << emitAccum(SP + SP + SP + SP + SP + SP, "base + l * W + m"); op << SP << SP << SP << SP << SP << "}\n"; op << SP << SP << SP << SP << "}\n"; + op << emitFinal(SP + SP + SP + SP, "kh * kw"); op << SP << SP << SP << SP << "Y[idx] = value;\n"; op << SP << SP << SP << "}\n"; } @@ -582,18 +618,18 @@ public: op << SP << SP << SP << SP << "int j = ow * sw - pad_left;\n"; op << SP << SP << SP << SP << "int k = od * sd - pad_front;\n"; op << SP << SP << SP << SP << "std::size_t base = static_cast(nc) * (H * W * D);\n\n"; - op << SP << SP << SP << SP << "T value = static_cast(-INFINITY);\n"; + op << emitInit(SP + SP + SP + SP); op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n"; op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n"; op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n"; op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n"; op << SP << SP << SP << SP << SP << SP << "for (int p = k; p < k + kd; ++p) {\n"; op << SP << SP << SP << SP << SP << SP << SP << "if (p < 0 || p >= D) continue;\n"; - op << SP << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * (W * D) + m * D + p];\n"; - op << SP << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n"; + op << emitAccum(SP + SP + SP + SP + SP + SP + SP, "base + l * (W * D) + m * D + p"); op << SP << SP << SP << SP << SP << SP << "}\n"; op << SP << SP << SP << SP << SP << "}\n"; op << SP << SP << SP << SP << "}\n"; + op << emitFinal(SP + SP + SP + SP, "kh * kw * kd"); op << SP << SP << SP << SP << "Y[idx] = value;\n"; op << SP << SP << SP << "}\n"; } @@ -609,24 +645,26 @@ public: std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { opName = "op_" + opName; - if (fPoolMode != MaxPool) - return ""; - std::string kname = "MaxPoolKernel_" + opName; - return SP + kname + " maxPoolKernel_" + opName + ";\n"; + if (fPoolMode == MaxPool) + return SP + "MaxPoolKernel_" + opName + " maxPoolKernel_" + opName + ";\n"; + if (fPoolMode == AveragePool) + return SP + "AvgPoolKernel_" + opName + " avgPoolKernel_" + opName + ";\n"; + return ""; } std::string Generate_GPU_ALPAKA(std::string opName) override { opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) throw std::runtime_error("SOFIE Pool called to Generate without being initialized first"); - if (fPoolMode != MaxPool) + if (fPoolMode != MaxPool && fPoolMode != AveragePool) return ""; + const bool isAvg = (fPoolMode == AveragePool); std::size_t totalOut = ConvertShapeToLength(fShapeY); - std::string kname = "maxPoolKernel_" + opName; + std::string kname = (isAvg ? "avgPoolKernel_" : "maxPoolKernel_") + opName; std::stringstream out; - out << "\n//------ MAXPOOL_GPU_ALPAKA\n"; + out << "\n//------ " << (isAvg ? "AVGPOOL" : "MAXPOOL") << "_GPU_ALPAKA\n"; out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << totalOut << "});\n"; out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index 62a7166..eca329f 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -173,6 +173,18 @@ #include "MaxPool3d_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/MaxPool3d.ref.hxx" +#include "AvgPool_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/AvgPool.ref.hxx" + +#include "AvgPoolPad_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/AvgPoolPad.ref.hxx" + +#include "AvgPoolCountIncludePad_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/AvgPoolCountIncludePad.ref.hxx" + +#include "GlobalAvgPool2d_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/GlobalAvgPool2d.ref.hxx" + #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" @@ -2567,6 +2579,162 @@ TEST_F(SofieAlpakaTest, MaxPool3d) } } +// AveragePool with no padding (kernel 3x2, stride [2,1]); this re uses the existing AvgPool model and CPU reference + +TEST_F(SofieAlpakaTest, AvgPool) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.4764, -0.1976, 1.6506, -0.2421, 0.6412, 1.9985, 0.3938, + 0.1347, 0.2204, -0.7503, + 0.2139, 0.7285, -0.0210, -0.4585, -1.5333, -0.4772, 0.5560, + 0.6323, -2.5372, 1.4906, + -1.1062, -0.9703, 0.2366, -0.9184, 0.3014, 0.7985, -0.6841, + -2.2854, -2.7728, -1.2806, + -1.0947, -0.5990, -0.3033, -1.9042, -0.5403, 0.2332, 0.9215, + -0.1549, 0.0557, -0.5567, + -1.4971, 0.5386, -0.2922, 0.4860, -0.3973, -0.4624, 0.4514, + 0.2385, 0.3783, -1.0500 + }); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(AvgPool_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_AvgPool::Session session("AvgPool_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = AvgPool_ExpectedOutput::output; + constexpr size_t nOut_avgpool = sizeof(AvgPool_ExpectedOutput::output) / sizeof(float); + + for (size_t i = 0; i < nOut_avgpool; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + +// AveragePool 3x3, pads 1 all round, count_include_pad=0 (default) + +TEST_F(SofieAlpakaTest, AvgPoolPad) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input(16); + for (size_t i = 0; i < input.size(); ++i) input[i] = float(i); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(AvgPoolPad_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_AvgPoolPad::Session session("AvgPoolPad_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = AvgPoolPad_ExpectedOutput::output; + constexpr size_t nOut_avgpoolpad = sizeof(AvgPoolPad_ExpectedOutput::output) / sizeof(float); + + for (size_t i = 0; i < nOut_avgpoolpad; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + +// Same model as AvgPoolPad but count_include_pad = 1, so the divisor is the full +// kernel area (kh*kw). The border values differ from AvgPoolPad, which is what this test pins down + +TEST_F(SofieAlpakaTest, AvgPoolCountIncludePad) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input(16); + for (size_t i = 0; i < input.size(); ++i) input[i] = float(i); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(AvgPoolCountIncludePad_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_AvgPoolCountIncludePad::Session session("AvgPoolCountIncludePad_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = AvgPoolCountIncludePad_ExpectedOutput::output; + constexpr size_t nOut_avgpoolinc = sizeof(AvgPoolCountIncludePad_ExpectedOutput::output) / sizeof(float); + + for (size_t i = 0; i < nOut_avgpoolinc; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + +// GlobalAveragePool: one output per channel = the mean of the whole channel. +// Input x[1,2,3,3] = iota 0..17, so the channel means are 4 and 13. + +TEST_F(SofieAlpakaTest, GlobalAvgPool2d) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input(18); + for (size_t i = 0; i < input.size(); ++i) input[i] = float(i); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(GlobalAvgPool2d_ExpectedOutput::output) / sizeof(float)})); + + { + SOFIE_GlobalAvgPool2d::Session session("GlobalAvgPool2d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GlobalAvgPool2d_ExpectedOutput::output; + constexpr size_t nOut_globalavg = sizeof(GlobalAvgPool2d_ExpectedOutput::output) / sizeof(float); + + for (size_t i = 0; i < nOut_globalavg; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + TEST_F(SofieAlpakaTest, BatchNormalization) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; diff --git a/core/test/input_models/AvgPoolCountIncludePad.onnx b/core/test/input_models/AvgPoolCountIncludePad.onnx new file mode 100644 index 0000000000000000000000000000000000000000..029843156d54d23c9439818b79793bbc54fb1333 GIT binary patch literal 215 zcmd;J7ve3bEXl~vi!V+t$}A|c+RMn5$;DV9#8|1s?O2vtl$f3xke{EUCBY?_oL`z( z5}%ouoKu>T8efo@BEh(Tkx5IGizhp^C@(c9zBnVXAk~4{fq4NVyOtmq3s6b110w{2 zgao+Qi%W_!Q&PdAKoKD^$Fg*g@y;NlJs~CsB&I}3gPb77CBnfdB*4YQ!3f1HK$;~9 PC|QY1(uswOL4Xec#V;@& literal 0 HcmV?d00001 diff --git a/core/test/input_models/AvgPoolPad.onnx b/core/test/input_models/AvgPoolPad.onnx new file mode 100644 index 0000000000000000000000000000000000000000..66eee7b87b18bffc70598b8cc6192e618f97d74a GIT binary patch literal 177 zcmd;J7ve3bEXl~vi!V+t$}A|cn$F1O&&60F#8|1s?O2vtl$f3xke{EUCCbH8`VyqBitW*+o&&f|p%yBGBElNyJ4am>W5#oml urGvOeDN)iOwPIW%9E?H&TudB{K+FWh%pjZul&r)e2~x!9#KOfOzy|;Y4j3l@ literal 0 HcmV?d00001 diff --git a/core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx b/core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx new file mode 100644 index 0000000..b33af03 --- /dev/null +++ b/core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx @@ -0,0 +1,3 @@ +namespace AvgPoolCountIncludePad_ExpectedOutput { +float output[] = {1.1111111640930176, 2.0, 2.6666667461395264, 2.0, 3.0, 5.0, 6.0, 4.333333492279053, 5.666666507720947, 9.0, 10.0, 7.0, 4.666666507720947, 7.333333492279053, 8.0, 5.55555534362793}; +} // namespace AvgPoolCountIncludePad_ExpectedOutput diff --git a/core/test/input_models/references/AvgPoolPad.ref.hxx b/core/test/input_models/references/AvgPoolPad.ref.hxx new file mode 100644 index 0000000..51f0e8f --- /dev/null +++ b/core/test/input_models/references/AvgPoolPad.ref.hxx @@ -0,0 +1,3 @@ +namespace AvgPoolPad_ExpectedOutput { +float output[] = {2.5, 3.0, 4.0, 4.5, 4.5, 5.0, 6.0, 6.5, 8.5, 9.0, 10.0, 10.5, 10.5, 11.0, 12.0, 12.5}; +} // namespace AvgPoolPad_ExpectedOutput diff --git a/core/test/input_models/references/GlobalAvgPool2d.ref.hxx b/core/test/input_models/references/GlobalAvgPool2d.ref.hxx new file mode 100644 index 0000000..4eee5e7 --- /dev/null +++ b/core/test/input_models/references/GlobalAvgPool2d.ref.hxx @@ -0,0 +1,3 @@ +namespace GlobalAvgPool2d_ExpectedOutput { +float output[] = {4.0, 13.0}; +} // namespace GlobalAvgPool2d_ExpectedOutput