Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion core/inc/SOFIE/ROperator_Elu.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,49 @@ public:

out << "\n//------ ELU \n";
out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * std::exp(tensor_"<< fNX<<"[id]) - 1);\n";
out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * (std::exp(tensor_"<< fNX<<"[id]) - 1));\n";
out << SP << "}\n";
return out.str();
}

std::vector<std::string> GetStdLibs() override { return { std::string("cmath") }; }

// elu gpu kernel
std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
std::string op;
op = "\n//------ ELU_KERNEL_ALPAKA\n";
op += "struct EluKernel {\n";
op += SP + "template<typename TAcc, typename T>\n";
op += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n";
op += SP + SP + "const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
op += SP + SP + "if (idx < numElements) { out[idx] = data[idx] >= T(0) ? data[idx]:alpha * (exp(data[idx]) - T(1)); }\n";
op += SP + "}\n";
op += "};\n";
return op;
}

std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
return SP + "EluKernel eluKernel;\n";
}

std::string Generate_GPU_ALPAKA(std::string OpName) override {
OpName = "op_" + OpName;
if (fShape.empty()) {
throw std::runtime_error("SOFIE Elu called to Generate_GPU_ALPAKA without being initialized");
}
std::stringstream out;
std::string length = ConvertDimShapeToLength(fShape);
out << "\n//------ ELU_GPU_ALPAKA\n";
out << SP << "auto const elementsPerThread_"<<fNX<<" = Vec::all(static_cast<Idx>(1));\n";
out << SP << "auto const elementsPerGrid_"<<fNX<<" = Vec::all(Idx{"<< length << "});\n";
out << SP << "auto const workDiv_" << fNX << " = sofie_workdiv(elementsPerGrid_" << fNX << ");\n";
out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNX
<< ", eluKernel, alpaka::getPtrNative(deviceBuf_" << fNX
<< "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast<Idx>(" << length << "), static_cast<float>("
<< std::setprecision(std::numeric_limits<float>::max_digits10) << falpha << "));\n";
out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n";
return out.str();
}
};

}//SOFIE
Expand Down
37 changes: 37 additions & 0 deletions core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@
#include "Clip_FromONNX_GPU_ALPAKA.hxx"
#include "Not_FromONNX_GPU_ALPAKA.hxx"

#include "Elu_FromONNX_GPU_ALPAKA.hxx"
#include "input_models/references/Elu.ref.hxx"

#include "GNN_model_FromONNX_GPU_ALPAKA.hxx"

#include <alpaka/alpaka.hpp>
Expand Down Expand Up @@ -3161,3 +3164,37 @@ TEST_F(SofieAlpakaTest, Logic_BitwiseNot)
for (std::size_t i = 0; i < N; ++i)
EXPECT_EQ(res[i], ref[i]) << " index=" << i;
}

TEST_F(SofieAlpakaTest, Elu)
{
constexpr float TOLERANCE = DEFAULT_TOLERANCE;

// same input as the CPU Elu test: spans negative + positive
std::vector<float> input({1.0f, -2.0f, 3.0f, 0.5f, -1.0f, 2.0f});

auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];

auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
alpaka::memcpy(queue, input_d, input_h);
alpaka::wait(queue);

constexpr size_t nOut = sizeof(Elu_ExpectedOutput::outputs) / sizeof(float);
auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut}));

{
SOFIE_Elu::Session<alpaka::TagGpuCudaRt> session;
auto result = session.infer(input_d);
alpaka::wait(queue);
cudaDeviceSynchronize();
alpaka::memcpy(queue, result_h, result);
alpaka::wait(queue);
}

float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
float* correct = Elu_ExpectedOutput::outputs;
for (size_t i = 0; i < nOut; ++i) {
EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
}
}