From 014ab107e37ee2cd4db164d78d650bc66fc5ab30 Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Mon, 15 Jun 2026 18:47:16 +0530 Subject: [PATCH] fixing cpu elu formula and added support for Elu GPU alpaka kernel with gtest --- core/inc/SOFIE/ROperator_Elu.hxx | 40 ++++++++++++++++++- .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 37 +++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/core/inc/SOFIE/ROperator_Elu.hxx b/core/inc/SOFIE/ROperator_Elu.hxx index 6588b61..31582ee 100644 --- a/core/inc/SOFIE/ROperator_Elu.hxx +++ b/core/inc/SOFIE/ROperator_Elu.hxx @@ -69,11 +69,49 @@ public: out << "\n//------ ELU \n"; out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * std::exp(tensor_"<< fNX<<"[id]) - 1);\n"; + out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * (std::exp(tensor_"<< fNX<<"[id]) - 1));\n"; out << SP << "}\n"; return out.str(); } + + std::vector GetStdLibs() override { return { std::string("cmath") }; } + + // elu gpu kernel + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ ELU_KERNEL_ALPAKA\n"; + op += "struct EluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if (idx < numElements) { out[idx] = data[idx] >= T(0) ? data[idx]:alpha * (exp(data[idx]) - T(1)); }\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "EluKernel eluKernel;\n"; + } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Elu called to Generate_GPU_ALPAKA without being initialized"); + } + std::stringstream out; + std::string length = ConvertDimShapeToLength(fShape); + out << "\n//------ ELU_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX + << ", eluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), static_cast(" + << std::setprecision(std::numeric_limits::max_digits10) << falpha << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } }; }//SOFIE diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index 5ad9383..ecfb7b6 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -176,6 +176,9 @@ #include "Clip_FromONNX_GPU_ALPAKA.hxx" #include "Not_FromONNX_GPU_ALPAKA.hxx" +#include "Elu_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Elu.ref.hxx" + #include "GNN_model_FromONNX_GPU_ALPAKA.hxx" #include @@ -3161,3 +3164,37 @@ TEST_F(SofieAlpakaTest, Logic_BitwiseNot) for (std::size_t i = 0; i < N; ++i) EXPECT_EQ(res[i], ref[i]) << " index=" << i; } + +TEST_F(SofieAlpakaTest, Elu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // same input as the CPU Elu test: spans negative + positive + std::vector input({1.0f, -2.0f, 3.0f, 0.5f, -1.0f, 2.0f}); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + constexpr size_t nOut = sizeof(Elu_ExpectedOutput::outputs) / sizeof(float); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{nOut})); + + { + SOFIE_Elu::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Elu_ExpectedOutput::outputs; + for (size_t i = 0; i < nOut; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +}