From c126ada90252ae097f0976ee42e293f221cf45fa Mon Sep 17 00:00:00 2001
From: Harsh Chauhan <harsh2005.hc@gmail.com>
Date: Fri, 29 May 2026 17:32:16 +0530
Subject: [PATCH 1/4] 2d maxpool gpu kernel and test

---
 core/inc/SOFIE/ROperator.hxx                  |  3 +-
 core/inc/SOFIE/ROperator_Pool.hxx             | 96 +++++++++++++++++++
 .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 43 +++++++++
 3 files changed, 141 insertions(+), 1 deletion(-)
diff --git a/core/inc/SOFIE/ROperator.hxx b/core/inc/SOFIE/ROperator.hxx
index c9ce5cd..db49a0d 100644
--- a/core/inc/SOFIE/ROperator.hxx
+++ b/core/inc/SOFIE/ROperator.hxx
@@ -38,7 +38,8 @@ enum class OperatorKind {
    UNARY_COS=22,
    UNARY_ABS=23,
    CLIP=24,
-   NOT=25
+   NOT=25,
+   POOL=26
 };
 
 inline const char* toString(OperatorKind kind) {
diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx
index 8e11271..1eef141 100644
--- a/core/inc/SOFIE/ROperator_Pool.hxx
+++ b/core/inc/SOFIE/ROperator_Pool.hxx
@@ -80,6 +80,7 @@ public:
       }
       fInputTensorNames = { fNX };
       fOutputTensorNames = { fNY };
+      fKind = OperatorKind::POOL;
    }
 
    // return input type (defined abstract in ROperator class )
@@ -472,6 +473,101 @@ public:
       out << SP << "}\n";
 
 
+      return out.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
+      if (fPoolMode != MaxPool)
+         throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far");
+      if (fDim != 2)
+         throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far");
+
+      std::string kname = "MaxPoolKernel_" + opName;
+
+      std::stringstream op;
+      op << "\n//------ MAXPOOL_KERNEL_ALPAKA\n";
+      op << SP << "struct " << kname << " {\n";
+      op << SP << SP << "template<typename TAcc, typename T>\n";
+      op << SP << SP << "ALPAKA_FN_ACC void operator()(\n";
+      op << SP << SP << SP << "TAcc const& acc,\n";
+      op << SP << SP << SP << "T const* __restrict__ X,\n";
+      op << SP << SP << SP << "T* __restrict__ Y,\n";
+      op << SP << SP << SP << "std::size_t const totalOut) const {\n\n";
+
+      op << SP << SP << SP << "constexpr int H        = " << fShapeX[2]          << ";\n";
+      op << SP << SP << SP << "constexpr int W        = " << fShapeX[3]          << ";\n";
+      op << SP << SP << SP << "constexpr int OH       = " << fShapeY[2]          << ";\n";
+      op << SP << SP << SP << "constexpr int OW       = " << fShapeY[3]          << ";\n";
+      op << SP << SP << SP << "constexpr int kh       = " << fAttrKernelShape[0] << ";\n";
+      op << SP << SP << SP << "constexpr int kw       = " << fAttrKernelShape[1] << ";\n";
+      op << SP << SP << SP << "constexpr int sh       = " << fAttrStrides[0]     << ";\n";
+      op << SP << SP << SP << "constexpr int sw       = " << fAttrStrides[1]     << ";\n";
+      op << SP << SP << SP << "constexpr int pad_top  = " << fAttrPads[0]        << ";\n";
+      op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2]        << ";\n\n";
+
+      op << SP << SP << SP << "auto const tid    = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n";
+      op << SP << SP << SP << SP << "int ow = idx % OW;\n";
+      op << SP << SP << SP << SP << "int oh = (idx / OW) % OH;\n";
+      op << SP << SP << SP << SP << "int nc = idx / (OH * OW);\n";
+      op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
+      op << SP << SP << SP << SP << "int j  = ow * sw - pad_left;\n";
+      op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * (H * W);\n\n";
+      op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+      op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
+      op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
+      op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n";
+      op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n";
+      op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n";
+      op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+      op << SP << SP << SP << SP << SP << "}\n";
+      op << SP << SP << SP << SP << "}\n";
+      op << SP << SP << SP << SP << "Y[idx] = value;\n";
+      op << SP << SP << SP << "}\n";
+      op << SP << SP << "}\n";
+      op << SP << "};\n";
+
+      return op.str();
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fPoolMode != MaxPool || fDim != 2)
+         return "";
+      std::string kname = "MaxPoolKernel_" + opName;
+      return SP + kname + " maxPoolKernel_" + opName + ";\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
+      if (fPoolMode != MaxPool)
+         throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far");
+      if (fDim != 2)
+         throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far");
+
+      std::size_t totalOut = ConvertShapeToLength(fShapeY);
+      std::string kname = "maxPoolKernel_" + opName;
+
+      std::stringstream out;
+      out << "\n//------ MAXPOOL_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"   << fNY << " = Vec::all(Idx{" << totalOut << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+
+      out << SP << "auto task_" << fNY << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+          << ", " << kname
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast<Idx>(" << totalOut << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << fNY << ");\n";
+
       return out.str();
    }
 };
diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
index 5ad9383..e513c27 100644
--- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
+++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -164,6 +164,9 @@
 #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx"
 #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx"
 
+#include "MaxPool2d_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/MaxPool2d.ref.hxx"
+
 #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx"
 #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx"
 
@@ -2442,6 +2445,46 @@ TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding)
    }
 }
 
+TEST_F(SofieAlpakaTest, MaxPool2d)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+       0.6266,  0.1656,  0.2753, -0.4558, -1.4592,  0.9285, -1.3410,  1.3223, -0.5936, -1.3648,
+      -0.2989,  0.5901, -0.8845, -0.0433,  0.8314, -1.7159, -0.5765,  0.8678,  1.0257,  0.7847,
+      -0.3421, -1.2364, -0.5805,  0.4421,  1.2184,  0.5043,  1.6823, -1.0483, -2.2798, -1.8927,
+       0.7716,  0.0405,  0.3121, -0.3011, -0.3266, -1.9660,  1.0837,  0.2317,  0.9084, -0.3285,
+      -0.9398, -0.2065, -0.9499, -0.9739, -0.1288, -0.1375, -1.2612,  0.8810,  0.8506,  0.4455
+   });
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(MaxPool2d_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_MaxPool2d::Session<alpaka::TagGpuCudaRt> session("MaxPool2d_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = MaxPool2d_ExpectedOutput::output;
+   constexpr size_t nOut_maxpool2d = sizeof(MaxPool2d_ExpectedOutput::output) / sizeof(float);
+
+   for (size_t i = 0; i < nOut_maxpool2d; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
 TEST_F(SofieAlpakaTest, BatchNormalization)
 {
     constexpr float TOLERANCE = DEFAULT_TOLERANCE;

From ae76b18d870d7261cf81311ceeacf5a7374daaf7 Mon Sep 17 00:00:00 2001
From: Harsh Chauhan <harsh2005.hc@gmail.com>
Date: Fri, 29 May 2026 21:57:53 +0530
Subject: [PATCH 2/4] empty codegen for unsupported pool variants

---
 core/inc/SOFIE/ROperator_Pool.hxx | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx
index 1eef141..728b3d8 100644
--- a/core/inc/SOFIE/ROperator_Pool.hxx
+++ b/core/inc/SOFIE/ROperator_Pool.hxx
@@ -480,10 +480,8 @@ public:
       opName = "op_" + opName;
       if (fShapeX.empty() || fShapeY.empty())
          throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
-      if (fPoolMode != MaxPool)
-         throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far");
-      if (fDim != 2)
-         throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far");
+      if (fPoolMode != MaxPool || fDim != 2)
+         return "";
 
       std::string kname = "MaxPoolKernel_" + opName;
 
@@ -547,10 +545,8 @@ public:
       opName = "op_" + opName;
       if (fShapeX.empty() || fShapeY.empty())
          throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
-      if (fPoolMode != MaxPool)
-         throw std::runtime_error("SOFIE Pool GPU: only MaxPool is supported so far");
-      if (fDim != 2)
-         throw std::runtime_error("SOFIE Pool GPU: only 2D is supported so far");
+      if (fPoolMode != MaxPool || fDim != 2)
+         return "";
 
       std::size_t totalOut = ConvertShapeToLength(fShapeY);
       std::string kname = "maxPoolKernel_" + opName;

From 21b0fbbf937c48af0dcef19f59ec38f33e585a7b Mon Sep 17 00:00:00 2001
From: Harsh Chauhan <harsh2005.hc@gmail.com>
Date: Fri, 29 May 2026 22:47:31 +0530
Subject: [PATCH 3/4] 1d and 3d maxpool gpu kernels and tests

---
 core/inc/SOFIE/ROperator_Pool.hxx             | 144 +++++++++++++-----
 .../TestCustomModelsFromONNXForAlpakaCuda.cxx |  82 ++++++++++
 2 files changed, 191 insertions(+), 35 deletions(-)

diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx
index 728b3d8..ca66e1e 100644
--- a/core/inc/SOFIE/ROperator_Pool.hxx
+++ b/core/inc/SOFIE/ROperator_Pool.hxx
@@ -480,7 +480,7 @@ public:
       opName = "op_" + opName;
       if (fShapeX.empty() || fShapeY.empty())
          throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
-      if (fPoolMode != MaxPool || fDim != 2)
+      if (fPoolMode != MaxPool)
          return "";
 
       std::string kname = "MaxPoolKernel_" + opName;
@@ -495,38 +495,112 @@ public:
       op << SP << SP << SP << "T* __restrict__ Y,\n";
       op << SP << SP << SP << "std::size_t const totalOut) const {\n\n";
 
-      op << SP << SP << SP << "constexpr int H        = " << fShapeX[2]          << ";\n";
-      op << SP << SP << SP << "constexpr int W        = " << fShapeX[3]          << ";\n";
-      op << SP << SP << SP << "constexpr int OH       = " << fShapeY[2]          << ";\n";
-      op << SP << SP << SP << "constexpr int OW       = " << fShapeY[3]          << ";\n";
-      op << SP << SP << SP << "constexpr int kh       = " << fAttrKernelShape[0] << ";\n";
-      op << SP << SP << SP << "constexpr int kw       = " << fAttrKernelShape[1] << ";\n";
-      op << SP << SP << SP << "constexpr int sh       = " << fAttrStrides[0]     << ";\n";
-      op << SP << SP << SP << "constexpr int sw       = " << fAttrStrides[1]     << ";\n";
-      op << SP << SP << SP << "constexpr int pad_top  = " << fAttrPads[0]        << ";\n";
-      op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2]        << ";\n\n";
-
-      op << SP << SP << SP << "auto const tid    = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
-      op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
-
-      op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n";
-      op << SP << SP << SP << SP << "int ow = idx % OW;\n";
-      op << SP << SP << SP << SP << "int oh = (idx / OW) % OH;\n";
-      op << SP << SP << SP << SP << "int nc = idx / (OH * OW);\n";
-      op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
-      op << SP << SP << SP << SP << "int j  = ow * sw - pad_left;\n";
-      op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * (H * W);\n\n";
-      op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
-      op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
-      op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
-      op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n";
-      op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n";
-      op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n";
-      op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
-      op << SP << SP << SP << SP << SP << "}\n";
-      op << SP << SP << SP << SP << "}\n";
-      op << SP << SP << SP << SP << "Y[idx] = value;\n";
-      op << SP << SP << SP << "}\n";
+      if (fDim == 1) {
+         op << SP << SP << SP << "constexpr int H        = " << fShapeX[2]          << ";\n";
+         op << SP << SP << SP << "constexpr int OH       = " << fShapeY[2]          << ";\n";
+         op << SP << SP << SP << "constexpr int kh       = " << fAttrKernelShape[0] << ";\n";
+         op << SP << SP << SP << "constexpr int sh       = " << fAttrStrides[0]     << ";\n";
+         op << SP << SP << SP << "constexpr int pad_top  = " << fAttrPads[0]        << ";\n\n";
+
+         op << SP << SP << SP << "auto const tid    = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+         op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n";
+         op << SP << SP << SP << SP << "int oh = idx % OH;\n";
+         op << SP << SP << SP << SP << "int nc = idx / OH;\n";
+         op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
+         op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * H;\n\n";
+         op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+         op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
+         op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
+         op << SP << SP << SP << SP << SP << "T xv = X[base + l];\n";
+         op << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+         op << SP << SP << SP << SP << "}\n";
+         op << SP << SP << SP << SP << "Y[idx] = value;\n";
+         op << SP << SP << SP << "}\n";
+      }
+      else if (fDim == 2) {
+         op << SP << SP << SP << "constexpr int H        = " << fShapeX[2]          << ";\n";
+         op << SP << SP << SP << "constexpr int W        = " << fShapeX[3]          << ";\n";
+         op << SP << SP << SP << "constexpr int OH       = " << fShapeY[2]          << ";\n";
+         op << SP << SP << SP << "constexpr int OW       = " << fShapeY[3]          << ";\n";
+         op << SP << SP << SP << "constexpr int kh       = " << fAttrKernelShape[0] << ";\n";
+         op << SP << SP << SP << "constexpr int kw       = " << fAttrKernelShape[1] << ";\n";
+         op << SP << SP << SP << "constexpr int sh       = " << fAttrStrides[0]     << ";\n";
+         op << SP << SP << SP << "constexpr int sw       = " << fAttrStrides[1]     << ";\n";
+         op << SP << SP << SP << "constexpr int pad_top  = " << fAttrPads[0]        << ";\n";
+         op << SP << SP << SP << "constexpr int pad_left = " << fAttrPads[2]        << ";\n\n";
+
+         op << SP << SP << SP << "auto const tid    = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+         op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n";
+         op << SP << SP << SP << SP << "int ow = idx % OW;\n";
+         op << SP << SP << SP << SP << "int oh = (idx / OW) % OH;\n";
+         op << SP << SP << SP << SP << "int nc = idx / (OH * OW);\n";
+         op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
+         op << SP << SP << SP << SP << "int j  = ow * sw - pad_left;\n";
+         op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * (H * W);\n\n";
+         op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+         op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
+         op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
+         op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n";
+         op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n";
+         op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n";
+         op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+         op << SP << SP << SP << SP << SP << "}\n";
+         op << SP << SP << SP << SP << "}\n";
+         op << SP << SP << SP << SP << "Y[idx] = value;\n";
+         op << SP << SP << SP << "}\n";
+      }
+      else if (fDim == 3) {
+         op << SP << SP << SP << "constexpr int H         = " << fShapeX[2]          << ";\n";
+         op << SP << SP << SP << "constexpr int W         = " << fShapeX[3]          << ";\n";
+         op << SP << SP << SP << "constexpr int D         = " << fShapeX[4]          << ";\n";
+         op << SP << SP << SP << "constexpr int OH        = " << fShapeY[2]          << ";\n";
+         op << SP << SP << SP << "constexpr int OW        = " << fShapeY[3]          << ";\n";
+         op << SP << SP << SP << "constexpr int OD        = " << fShapeY[4]          << ";\n";
+         op << SP << SP << SP << "constexpr int kh        = " << fAttrKernelShape[0] << ";\n";
+         op << SP << SP << SP << "constexpr int kw        = " << fAttrKernelShape[1] << ";\n";
+         op << SP << SP << SP << "constexpr int kd        = " << fAttrKernelShape[2] << ";\n";
+         op << SP << SP << SP << "constexpr int sh        = " << fAttrStrides[0]     << ";\n";
+         op << SP << SP << SP << "constexpr int sw        = " << fAttrStrides[1]     << ";\n";
+         op << SP << SP << SP << "constexpr int sd        = " << fAttrStrides[2]     << ";\n";
+         op << SP << SP << SP << "constexpr int pad_top   = " << fAttrPads[0]        << ";\n";
+         op << SP << SP << SP << "constexpr int pad_left  = " << fAttrPads[2]        << ";\n";
+         op << SP << SP << SP << "constexpr int pad_front = " << fAttrPads[4]        << ";\n\n";
+
+         op << SP << SP << SP << "auto const tid    = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op << SP << SP << SP << "auto const stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+         op << SP << SP << SP << "for (std::size_t idx = tid; idx < totalOut; idx += stride) {\n";
+         op << SP << SP << SP << SP << "int od = idx % OD;\n";
+         op << SP << SP << SP << SP << "int ow = (idx / OD) % OW;\n";
+         op << SP << SP << SP << SP << "int oh = (idx / (OD * OW)) % OH;\n";
+         op << SP << SP << SP << SP << "int nc = idx / (OD * OW * OH);\n";
+         op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
+         op << SP << SP << SP << SP << "int j  = ow * sw - pad_left;\n";
+         op << SP << SP << SP << SP << "int k  = od * sd - pad_front;\n";
+         op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * (H * W * D);\n\n";
+         op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+         op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
+         op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
+         op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n";
+         op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n";
+         op << SP << SP << SP << SP << SP << SP << "for (int p = k; p < k + kd; ++p) {\n";
+         op << SP << SP << SP << SP << SP << SP << SP << "if (p < 0 || p >= D) continue;\n";
+         op << SP << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * (W * D) + m * D + p];\n";
+         op << SP << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+         op << SP << SP << SP << SP << SP << SP << "}\n";
+         op << SP << SP << SP << SP << SP << "}\n";
+         op << SP << SP << SP << SP << "}\n";
+         op << SP << SP << SP << SP << "Y[idx] = value;\n";
+         op << SP << SP << SP << "}\n";
+      }
+      else {
+         return "";
+      }
+
       op << SP << SP << "}\n";
       op << SP << "};\n";
 
@@ -535,7 +609,7 @@ public:
 
    std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
       opName = "op_" + opName;
-      if (fPoolMode != MaxPool || fDim != 2)
+      if (fPoolMode != MaxPool)
          return "";
       std::string kname = "MaxPoolKernel_" + opName;
       return SP + kname + " maxPoolKernel_" + opName + ";\n";
@@ -545,7 +619,7 @@ public:
       opName = "op_" + opName;
       if (fShapeX.empty() || fShapeY.empty())
          throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
-      if (fPoolMode != MaxPool || fDim != 2)
+      if (fPoolMode != MaxPool)
          return "";
 
       std::size_t totalOut = ConvertShapeToLength(fShapeY);
diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
index e513c27..62a7166 100644
--- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
+++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -164,9 +164,15 @@
 #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx"
 #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx"
 
+#include "MaxPool1d_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/MaxPool1d.ref.hxx"
+
 #include "MaxPool2d_FromONNX_GPU_ALPAKA.hxx"
 #include "input_models/references/MaxPool2d.ref.hxx"
 
+#include "MaxPool3d_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/MaxPool3d.ref.hxx"
+
 #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx"
 #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx"
 
@@ -2485,6 +2491,82 @@ TEST_F(SofieAlpakaTest, MaxPool2d)
    }
 }
 
+TEST_F(SofieAlpakaTest, MaxPool1d)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+       0.0907,  0.1029,  0.8143,  1.4497, -0.7785,  0.3825, -0.3764,  1.5785, -0.0835,  0.1622,
+       1.5867,  0.9823, -0.8821,  0.4439, -0.1378, -0.2273, -0.0198, -2.0230,  0.0905,  0.6674,
+      -1.4290, -1.3100, -0.9439, -0.0833, -0.1919,  0.6886,  0.9389, -1.2914, -1.3584, -2.0341,
+      -0.3269,  0.1704,  1.1776,  1.3972, -1.8874, -1.5334,  1.1541,  0.3011,  0.6569, -2.3504,
+       0.4033,  0.1142,  2.2846, -1.3948, -0.8573,  0.5756, -1.0864,  0.2283,  0.8947,  1.7627,
+      -0.1657,  0.0649, -1.6066,  0.4162, -1.1525, -0.8184,  1.1324, -1.1086,  0.1061,  1.0071
+   }); // took from reference output
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(MaxPool1d_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_MaxPool1d::Session<alpaka::TagGpuCudaRt> session("MaxPool1d_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = MaxPool1d_ExpectedOutput::output;
+   constexpr size_t nOut_maxpool1d = sizeof(MaxPool1d_ExpectedOutput::output) / sizeof(float);
+   for (size_t i = 0; i < nOut_maxpool1d; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+TEST_F(SofieAlpakaTest, MaxPool3d)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+      -2.6496,  1.0476, -0.5153,  0.3771,  0.4129, -0.3077, -0.8717, -0.8040, -0.3525,
+      -0.1765, -0.3364,  0.8737, -0.2381, -0.8297,  0.4666,  0.6984, -0.6760,  0.6298,
+       1.3833,  0.1101,  0.2039, -0.5477,  0.2341,  0.9181,  0.3842,  0.2428,  1.7924
+   });// took from reference output
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(MaxPool3d_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_MaxPool3d::Session<alpaka::TagGpuCudaRt> session("MaxPool3d_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = MaxPool3d_ExpectedOutput::output;
+   constexpr size_t nOut_maxpool3d = sizeof(MaxPool3d_ExpectedOutput::output) / sizeof(float);
+
+   for (size_t i = 0; i < nOut_maxpool3d; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
 TEST_F(SofieAlpakaTest, BatchNormalization)
 {
     constexpr float TOLERANCE = DEFAULT_TOLERANCE;

From 22b059c855ed73f73de65808ee029e3be840fe53 Mon Sep 17 00:00:00 2001
From: Harsh Chauhan <harsh2005.hc@gmail.com>
Date: Sun, 7 Jun 2026 15:53:16 +0530
Subject: [PATCH 4/4] added AvgPool and GlobalAvgPool GPU kernels + tests

---
 core/inc/SOFIE/ROperator_Pool.hxx             |  76 ++++++--
 .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 168 ++++++++++++++++++
 .../input_models/AvgPoolCountIncludePad.onnx  | Bin 0 -> 215 bytes
 core/test/input_models/AvgPoolPad.onnx        | Bin 0 -> 177 bytes
 core/test/input_models/GlobalAvgPool2d.onnx   | Bin 0 -> 127 bytes
 .../references/AvgPoolCountIncludePad.ref.hxx |   3 +
 .../references/AvgPoolPad.ref.hxx             |   3 +
 .../references/GlobalAvgPool2d.ref.hxx        |   3 +
 8 files changed, 234 insertions(+), 19 deletions(-)
 create mode 100644 core/test/input_models/AvgPoolCountIncludePad.onnx
 create mode 100644 core/test/input_models/AvgPoolPad.onnx
 create mode 100644 core/test/input_models/GlobalAvgPool2d.onnx
 create mode 100644 core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx
 create mode 100644 core/test/input_models/references/AvgPoolPad.ref.hxx
 create mode 100644 core/test/input_models/references/GlobalAvgPool2d.ref.hxx

diff --git a/core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx
index ca66e1e..7d99b64 100644
--- a/core/inc/SOFIE/ROperator_Pool.hxx
+++ b/core/inc/SOFIE/ROperator_Pool.hxx
@@ -480,13 +480,49 @@ public:
       opName = "op_" + opName;
       if (fShapeX.empty() || fShapeY.empty())
          throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
-      if (fPoolMode != MaxPool)
+      if (fPoolMode != MaxPool && fPoolMode != AveragePool)
          return "";
 
-      std::string kname = "MaxPoolKernel_" + opName;
+      const bool isAvg = (fPoolMode == AveragePool);
+      bool doPadding = false;
+      for (auto & e : fAttrPads) doPadding |= (e > 0);
+      // count_include_pad == 0 with padding: divide by the in-bounds cells counted
+      // at run time; otherwise by the constant kernel area (CPU Generate above).
+      const bool runtimeCount = isAvg && fAttrCountIncludePad == 0 && doPadding;
+
+      const std::string kname = (isAvg ? "AvgPoolKernel_" : "MaxPoolKernel_") + opName;
+
+      // Mode dependent fragments, so the index math stays shared across 1D/2D/3D.
+      auto emitInit = [&](const std::string & ind) {
+         std::string s;
+         if (isAvg) {
+            s += ind + "T value = static_cast<T>(0);\n";
+            if (runtimeCount) s += ind + "int count = 0;\n";
+         } else {
+            s += ind + "T value = static_cast<T>(-INFINITY);\n";
+         }
+         return s;
+      };
+      auto emitAccum = [&](const std::string & ind, const std::string & xidx) {
+         std::string s;
+         if (isAvg) {
+            s += ind + "value += X[" + xidx + "];\n";
+            if (runtimeCount) s += ind + "++count;\n";
+         } else {
+            s += ind + "T xv = X[" + xidx + "];\n";
+            s += ind + "if (xv > value) value = xv;\n";
+         }
+         return s;
+      };
+      auto emitFinal = [&](const std::string & ind, const std::string & area) {
+         std::string s;
+         if (isAvg)
+            s += ind + "value /= static_cast<T>(" + (runtimeCount ? std::string("count") : area) + ");\n";
+         return s;
+      };
 
       std::stringstream op;
-      op << "\n//------ MAXPOOL_KERNEL_ALPAKA\n";
+      op << "\n//------ " << (isAvg ? "AVGPOOL" : "MAXPOOL") << "_KERNEL_ALPAKA\n";
       op << SP << "struct " << kname << " {\n";
       op << SP << SP << "template<typename TAcc, typename T>\n";
       op << SP << SP << "ALPAKA_FN_ACC void operator()(\n";
@@ -510,12 +546,12 @@ public:
          op << SP << SP << SP << SP << "int nc = idx / OH;\n";
          op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
          op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * H;\n\n";
-         op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+         op << emitInit(SP + SP + SP + SP);
          op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
          op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
-         op << SP << SP << SP << SP << SP << "T xv = X[base + l];\n";
-         op << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+         op << emitAccum(SP + SP + SP + SP + SP, "base + l");
          op << SP << SP << SP << SP << "}\n";
+         op << emitFinal(SP + SP + SP + SP, "kh");
          op << SP << SP << SP << SP << "Y[idx] = value;\n";
          op << SP << SP << SP << "}\n";
       }
@@ -541,15 +577,15 @@ public:
          op << SP << SP << SP << SP << "int i  = oh * sh - pad_top;\n";
          op << SP << SP << SP << SP << "int j  = ow * sw - pad_left;\n";
          op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * (H * W);\n\n";
-         op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+         op << emitInit(SP + SP + SP + SP);
          op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
          op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
          op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n";
          op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n";
-         op << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * W + m];\n";
-         op << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+         op << emitAccum(SP + SP + SP + SP + SP + SP, "base + l * W + m");
          op << SP << SP << SP << SP << SP << "}\n";
          op << SP << SP << SP << SP << "}\n";
+         op << emitFinal(SP + SP + SP + SP, "kh * kw");
          op << SP << SP << SP << SP << "Y[idx] = value;\n";
          op << SP << SP << SP << "}\n";
       }
@@ -582,18 +618,18 @@ public:
          op << SP << SP << SP << SP << "int j  = ow * sw - pad_left;\n";
          op << SP << SP << SP << SP << "int k  = od * sd - pad_front;\n";
          op << SP << SP << SP << SP << "std::size_t base = static_cast<std::size_t>(nc) * (H * W * D);\n\n";
-         op << SP << SP << SP << SP << "T value = static_cast<T>(-INFINITY);\n";
+         op << emitInit(SP + SP + SP + SP);
          op << SP << SP << SP << SP << "for (int l = i; l < i + kh; ++l) {\n";
          op << SP << SP << SP << SP << SP << "if (l < 0 || l >= H) continue;\n";
          op << SP << SP << SP << SP << SP << "for (int m = j; m < j + kw; ++m) {\n";
          op << SP << SP << SP << SP << SP << SP << "if (m < 0 || m >= W) continue;\n";
          op << SP << SP << SP << SP << SP << SP << "for (int p = k; p < k + kd; ++p) {\n";
          op << SP << SP << SP << SP << SP << SP << SP << "if (p < 0 || p >= D) continue;\n";
-         op << SP << SP << SP << SP << SP << SP << SP << "T xv = X[base + l * (W * D) + m * D + p];\n";
-         op << SP << SP << SP << SP << SP << SP << SP << "if (xv > value) value = xv;\n";
+         op << emitAccum(SP + SP + SP + SP + SP + SP + SP, "base + l * (W * D) + m * D + p");
          op << SP << SP << SP << SP << SP << SP << "}\n";
          op << SP << SP << SP << SP << SP << "}\n";
          op << SP << SP << SP << SP << "}\n";
+         op << emitFinal(SP + SP + SP + SP, "kh * kw * kd");
          op << SP << SP << SP << SP << "Y[idx] = value;\n";
          op << SP << SP << SP << "}\n";
       }
@@ -609,24 +645,26 @@ public:
 
    std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
       opName = "op_" + opName;
-      if (fPoolMode != MaxPool)
-         return "";
-      std::string kname = "MaxPoolKernel_" + opName;
-      return SP + kname + " maxPoolKernel_" + opName + ";\n";
+      if (fPoolMode == MaxPool)
+         return SP + "MaxPoolKernel_" + opName + " maxPoolKernel_" + opName + ";\n";
+      if (fPoolMode == AveragePool)
+         return SP + "AvgPoolKernel_" + opName + " avgPoolKernel_" + opName + ";\n";
+      return "";
    }
 
    std::string Generate_GPU_ALPAKA(std::string opName) override {
       opName = "op_" + opName;
       if (fShapeX.empty() || fShapeY.empty())
          throw std::runtime_error("SOFIE Pool called to Generate without being initialized first");
-      if (fPoolMode != MaxPool)
+      if (fPoolMode != MaxPool && fPoolMode != AveragePool)
          return "";
 
+      const bool isAvg = (fPoolMode == AveragePool);
       std::size_t totalOut = ConvertShapeToLength(fShapeY);
-      std::string kname = "maxPoolKernel_" + opName;
+      std::string kname = (isAvg ? "avgPoolKernel_" : "maxPoolKernel_") + opName;
 
       std::stringstream out;
-      out << "\n//------ MAXPOOL_GPU_ALPAKA\n";
+      out << "\n//------ " << (isAvg ? "AVGPOOL" : "MAXPOOL") << "_GPU_ALPAKA\n";
       out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n";
       out << SP << "auto const elementsPerGrid_"   << fNY << " = Vec::all(Idx{" << totalOut << "});\n";
       out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
index 62a7166..eca329f 100644
--- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
+++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -173,6 +173,18 @@
 #include "MaxPool3d_FromONNX_GPU_ALPAKA.hxx"
 #include "input_models/references/MaxPool3d.ref.hxx"
 
+#include "AvgPool_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/AvgPool.ref.hxx"
+
+#include "AvgPoolPad_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/AvgPoolPad.ref.hxx"
+
+#include "AvgPoolCountIncludePad_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/AvgPoolCountIncludePad.ref.hxx"
+
+#include "GlobalAvgPool2d_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/GlobalAvgPool2d.ref.hxx"
+
 #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx"
 #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx"
 
@@ -2567,6 +2579,162 @@ TEST_F(SofieAlpakaTest, MaxPool3d)
    }
 }
 
+// AveragePool with no padding (kernel 3x2, stride [2,1]); this re uses the existing AvgPool model and CPU reference
+
+TEST_F(SofieAlpakaTest, AvgPool)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+      0.4764, -0.1976,  1.6506, -0.2421,  0.6412,  1.9985,  0.3938,
+            0.1347,  0.2204, -0.7503,
+           0.2139,  0.7285, -0.0210, -0.4585, -1.5333, -0.4772,  0.5560,
+            0.6323, -2.5372,  1.4906,
+          -1.1062, -0.9703,  0.2366, -0.9184,  0.3014,  0.7985, -0.6841,
+           -2.2854, -2.7728, -1.2806,
+          -1.0947, -0.5990, -0.3033, -1.9042, -0.5403,  0.2332,  0.9215,
+           -0.1549,  0.0557, -0.5567,
+          -1.4971,  0.5386, -0.2922,  0.4860, -0.3973, -0.4624,  0.4514,
+            0.2385,  0.3783, -1.0500
+   });
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(AvgPool_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_AvgPool::Session<alpaka::TagGpuCudaRt> session("AvgPool_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = AvgPool_ExpectedOutput::output;
+   constexpr size_t nOut_avgpool = sizeof(AvgPool_ExpectedOutput::output) / sizeof(float);
+
+   for (size_t i = 0; i < nOut_avgpool; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+// AveragePool 3x3, pads 1 all round, count_include_pad=0 (default)
+
+TEST_F(SofieAlpakaTest, AvgPoolPad)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input(16);
+   for (size_t i = 0; i < input.size(); ++i) input[i] = float(i);
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(AvgPoolPad_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_AvgPoolPad::Session<alpaka::TagGpuCudaRt> session("AvgPoolPad_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = AvgPoolPad_ExpectedOutput::output;
+   constexpr size_t nOut_avgpoolpad = sizeof(AvgPoolPad_ExpectedOutput::output) / sizeof(float);
+
+   for (size_t i = 0; i < nOut_avgpoolpad; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+// Same model as AvgPoolPad but count_include_pad = 1, so the divisor is the full
+// kernel area (kh*kw). The border values differ from AvgPoolPad, which is what this test pins down
+
+TEST_F(SofieAlpakaTest, AvgPoolCountIncludePad)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input(16);
+   for (size_t i = 0; i < input.size(); ++i) input[i] = float(i);
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(AvgPoolCountIncludePad_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_AvgPoolCountIncludePad::Session<alpaka::TagGpuCudaRt> session("AvgPoolCountIncludePad_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = AvgPoolCountIncludePad_ExpectedOutput::output;
+   constexpr size_t nOut_avgpoolinc = sizeof(AvgPoolCountIncludePad_ExpectedOutput::output) / sizeof(float);
+
+   for (size_t i = 0; i < nOut_avgpoolinc; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+// GlobalAveragePool: one output per channel = the mean of the whole channel.
+// Input x[1,2,3,3] = iota 0..17, so the channel means are 4 and 13.
+
+TEST_F(SofieAlpakaTest, GlobalAvgPool2d)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input(18);
+   for (size_t i = 0; i < input.size(); ++i) input[i] = float(i);
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{sizeof(GlobalAvgPool2d_ExpectedOutput::output) / sizeof(float)}));
+
+   {
+      SOFIE_GlobalAvgPool2d::Session<alpaka::TagGpuCudaRt> session("GlobalAvgPool2d_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(input_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* correct = GlobalAvgPool2d_ExpectedOutput::output;
+   constexpr size_t nOut_globalavg = sizeof(GlobalAvgPool2d_ExpectedOutput::output) / sizeof(float);
+
+   for (size_t i = 0; i < nOut_globalavg; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
 TEST_F(SofieAlpakaTest, BatchNormalization)
 {
     constexpr float TOLERANCE = DEFAULT_TOLERANCE;
diff --git a/core/test/input_models/AvgPoolCountIncludePad.onnx b/core/test/input_models/AvgPoolCountIncludePad.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..029843156d54d23c9439818b79793bbc54fb1333
GIT binary patch
literal 215
zcmd;J7ve3bEXl~vi!V+t$}A|c+RMn5$;DV9#8|1s?O2vtl$f3xke{EUCBY?_oL`z(
z5}%ouoKu>T8efo@BEh(Tkx5IGizhp^C@(c9zBnVXAk~4{fq4NVyOtmq3s6b110w{2
zgao+Qi%W_!Q&PdAKoKD^$Fg*g@y;NlJs~CsB&I}3gPb77CBnfdB*4YQ!3f1HK$;~9
PC|QY1(uswOL4Xec#V;@&

literal 0
HcmV?d00001

diff --git a/core/test/input_models/AvgPoolPad.onnx b/core/test/input_models/AvgPoolPad.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..66eee7b87b18bffc70598b8cc6192e618f97d74a
GIT binary patch
literal 177
zcmd;J7ve3bEXl~vi!V+t$}A|cn$F1O&&60F#8|1s?O2vtl$f3xke{EUCCbH<om!Na
zniF4~kyw!G!0f=hfRSBGkc*`tF{Rjn5duL%0$l9HB}JJjsbEo{h!B@!Svts&fW(w2
jX^`P!Tp}EdLIPY&9E?!R0;E}zfRdHCB%N5e7zFqLXFVe8

literal 0
HcmV?d00001

diff --git a/core/test/input_models/GlobalAvgPool2d.onnx b/core/test/input_models/GlobalAvgPool2d.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..f9d74928fd5f9e9efd81273ebad806e39014c57a
GIT binary patch
literal 127
zcmd;J7ve3bEXl~vi!V+t$}A|cO5>8`VyqBitW*+o&&f|p%yBGBElNyJ4am>W5#oml
urGvOeDN)iOwPIW%9E?H&TudB{K+FWh%pjZul&r)e2~x!9#KOfOzy|;Y4j3l@

literal 0
HcmV?d00001

diff --git a/core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx b/core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx
new file mode 100644
index 0000000..b33af03
--- /dev/null
+++ b/core/test/input_models/references/AvgPoolCountIncludePad.ref.hxx
@@ -0,0 +1,3 @@
+namespace AvgPoolCountIncludePad_ExpectedOutput {
+float output[] = {1.1111111640930176, 2.0, 2.6666667461395264, 2.0, 3.0, 5.0, 6.0, 4.333333492279053, 5.666666507720947, 9.0, 10.0, 7.0, 4.666666507720947, 7.333333492279053, 8.0, 5.55555534362793};
+} // namespace AvgPoolCountIncludePad_ExpectedOutput
diff --git a/core/test/input_models/references/AvgPoolPad.ref.hxx b/core/test/input_models/references/AvgPoolPad.ref.hxx
new file mode 100644
index 0000000..51f0e8f
--- /dev/null
+++ b/core/test/input_models/references/AvgPoolPad.ref.hxx
@@ -0,0 +1,3 @@
+namespace AvgPoolPad_ExpectedOutput {
+float output[] = {2.5, 3.0, 4.0, 4.5, 4.5, 5.0, 6.0, 6.5, 8.5, 9.0, 10.0, 10.5, 10.5, 11.0, 12.0, 12.5};
+} // namespace AvgPoolPad_ExpectedOutput
diff --git a/core/test/input_models/references/GlobalAvgPool2d.ref.hxx b/core/test/input_models/references/GlobalAvgPool2d.ref.hxx
new file mode 100644
index 0000000..4eee5e7
--- /dev/null
+++ b/core/test/input_models/references/GlobalAvgPool2d.ref.hxx
@@ -0,0 +1,3 @@
+namespace GlobalAvgPool2d_ExpectedOutput {
+float output[] = {4.0, 13.0};
+} // namespace GlobalAvgPool2d_ExpectedOutput