From 9979d37b85926940639278f79f14969bfa02948d Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Tue, 9 Jun 2026 15:00:01 +0530 Subject: [PATCH 1/3] fix dilation>1 in conv for cpu and gpu + test --- core/inc/SOFIE/ROperator_Conv.hxx | 31 ++++++++++---- .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 40 ++++++++++++++++++ core/test/input_models/ConvWithDilation.onnx | Bin 0 -> 317 bytes .../references/ConvWithDilation.ref.hxx | 3 ++ 4 files changed, 66 insertions(+), 8 deletions(-) create mode 100644 core/test/input_models/ConvWithDilation.onnx create mode 100644 core/test/input_models/references/ConvWithDilation.ref.hxx diff --git a/core/inc/SOFIE/ROperator_Conv.hxx b/core/inc/SOFIE/ROperator_Conv.hxx index 835a0ff..31ad69e 100644 --- a/core/inc/SOFIE/ROperator_Conv.hxx +++ b/core/inc/SOFIE/ROperator_Conv.hxx @@ -430,6 +430,9 @@ public: out << SP << SP << "}\n"; out << SP << "}\n"; + // dilation already folded into the expanded kernel and dilated _f layout above + fAttrDilations = std::vector(3, 1); + //out << SP << "char " << OpName << "_transA = 'T';\n"; out << SP << "char " << OpName << "_transA = 'N';\n"; out << SP << "char " << OpName << "_transB = 'N';\n"; @@ -634,6 +637,10 @@ public: size_t ocstride = fShapeW[1] * icstride; size_t wTotalElements = ConvertShapeToLength(fShapeW); + // effective (dilation-expanded) kernel extents, used for the dense im2col decode + size_t kHeightEff = (fDim > 1) ? fAttrKernelShape[ih] : 1; + size_t kWidthEff = fAttrKernelShape[iw]; + std::string op; // Kernel 1: Weight vectorisation — reorder W into _f with dilation layout @@ -709,13 +716,13 @@ public: op += SP + SP + SP + SP + "std::size_t const ic = col_row / " + std::to_string(kernelSize) + "u;\n"; op += SP + SP + SP + SP + "std::size_t const k_rem = col_row % " + std::to_string(kernelSize) + "u;\n"; if (fDim > 2) { - op += SP + SP + SP + SP + "std::size_t const kd = k_rem / " + std::to_string(kHeight * kWidth) + "u;\n"; - op += SP + SP + SP + SP + "std::size_t const kh = (k_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n"; - op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n"; + op += SP + SP + SP + SP + "std::size_t const kd = k_rem / " + std::to_string(kHeightEff * kWidthEff) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = (k_rem / " + std::to_string(kWidthEff) + "u) % " + std::to_string(kHeightEff) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidthEff) + "u;\n\n"; } else if (fDim > 1) { op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; - op += SP + SP + SP + SP + "std::size_t const kh = k_rem / " + std::to_string(kWidth) + "u;\n"; - op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n"; + op += SP + SP + SP + SP + "std::size_t const kh = k_rem / " + std::to_string(kWidthEff) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidthEff) + "u;\n\n"; } else { op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n"; @@ -740,7 +747,7 @@ public: // applying it here would make id_in negative and zero the whole output. if (fDim >= 3) { op += SP + SP + SP + SP + "int64_t const id_in = static_cast(od * " + std::to_string(fAttrStrides[0]) - + "u + kd * " + std::to_string(fAttrDilations[0]) + "u) - " + std::to_string(fAttrPads[0]) + ";\n"; + + "u + kd) - " + std::to_string(fAttrPads[0]) + ";\n"; } else { op += SP + SP + SP + SP + "int64_t const id_in = 0;\n"; } @@ -750,7 +757,7 @@ public: size_t const hIdx = (fDim > 2) ? 1 : 0; if (fDim >= 2) { op += SP + SP + SP + SP + "int64_t const ih_in = static_cast(oh * " + std::to_string(fAttrStrides[hIdx]) - + "u + kh * " + std::to_string(fAttrDilations[hIdx]) + "u) - " + std::to_string(fAttrPads[hIdx]) + ";\n"; + + "u + kh) - " + std::to_string(fAttrPads[hIdx]) + ";\n"; } else { op += SP + SP + SP + SP + "int64_t const ih_in = 0;\n"; } @@ -759,7 +766,7 @@ public: { size_t const wIdx = fDim - 1; op += SP + SP + SP + SP + "int64_t const iw_in = static_cast(ow * " + std::to_string(fAttrStrides[wIdx]) - + "u + kw * " + std::to_string(fAttrDilations[wIdx]) + "u) - " + std::to_string(fAttrPads[wIdx]) + ";\n\n"; + + "u + kw) - " + std::to_string(fAttrPads[wIdx]) + ";\n\n"; } op += SP + SP + SP + SP + "bool const in_bounds =\n"; @@ -846,6 +853,14 @@ public: std::stringstream out; out << "\n//------ CONV_GPU_ALPAKA\n"; + // dilation>1 leaves gaps in the dilated _f layout; zero it so those slots stay 0 + bool hasDilation = false; + for (size_t d = 0; d < fDim; ++d) if (fAttrDilations[d] > 1) hasDilation = true; + if (hasDilation) { + out << SP << "alpaka::memset(queue, deviceBuf_" << convK << ", 0);\n"; + out << SP << "alpaka::wait(queue);\n"; + } + // ----------------------------------------------------------------------- // Step 1: Weight vectorisation kernel — runs once, fully on GPU // ----------------------------------------------------------------------- diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index 5ad9383..5b99e29 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -164,6 +164,9 @@ #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" +#include "ConvWithDilation_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithDilation.ref.hxx" + #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" @@ -2442,6 +2445,43 @@ TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding) } } +// Exercises the Conv GPU dilation>1 path (issue #32). x[1,1,7,7], W[2,1,3,3] iota, 3x3 +// kernel, dilation 2, no padding. Before the fix the dilation was double-counted. +TEST_F(SofieAlpakaTest, ConvWithDilation) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input(49); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithDilation_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithDilation::Session session("ConvWithDilation_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithDilation_ExpectedOutput::all_ones; + constexpr size_t nOut_dilation = sizeof(ConvWithDilation_ExpectedOutput::all_ones) / sizeof(float); + + for (size_t i = 0; i < nOut_dilation; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + TEST_F(SofieAlpakaTest, BatchNormalization) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; diff --git a/core/test/input_models/ConvWithDilation.onnx b/core/test/input_models/ConvWithDilation.onnx new file mode 100644 index 0000000000000000000000000000000000000000..bf631ca194bdca3efcfb6bd12ee44e3ac8a02196 GIT binary patch literal 317 zcmYk%y-LL}7zW_{6cU3<4C>+{j-8xc%sD!UlUu;0R$FNB(1s?6-8y<12fPt4#Y^$c zvBOF7@WJ=yhe%s3O!&wB�mR2IB(Kl_h)KYYe&bK Date: Tue, 9 Jun 2026 15:51:47 +0530 Subject: [PATCH 2/3] add cpu dilation test --- core/test/TestCustomModelsFromONNX.cxx | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/core/test/TestCustomModelsFromONNX.cxx b/core/test/TestCustomModelsFromONNX.cxx index 902cbcc..6469f6a 100644 --- a/core/test/TestCustomModelsFromONNX.cxx +++ b/core/test/TestCustomModelsFromONNX.cxx @@ -80,6 +80,9 @@ #include "ConvWithAsymmetricPadding_FromONNX.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" +#include "ConvWithDilation_FromONNX.hxx" +#include "input_models/references/ConvWithDilation.ref.hxx" + #include "MaxPool1d_FromONNX.hxx" #include "input_models/references/MaxPool1d.ref.hxx" @@ -967,6 +970,25 @@ TEST(ONNX, ConvWithStridesNoPadding) } +TEST(ONNX, ConvWithDilation) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input(49); + std::iota(input.begin(), input.end(), 0.0f); + SOFIE_ConvWithDilation::Session s("ConvWithDilation_FromONNX.dat"); + std::vector output = s.infer(input.data()); + + EXPECT_EQ(output.size(), sizeof(ConvWithDilation_ExpectedOutput::all_ones) / sizeof(float)); + + float *correct = ConvWithDilation_ExpectedOutput::all_ones; + + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_LE(std::abs(output[i] - correct[i]), TOLERANCE); + } +} + + // Disables test (asymmetric padding not supported) TEST(DISABLED_ONNX, ConvWithAsymmetricPadding) { From ec97e1efaf87425e896083b979c5f0f37b4a3046 Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Tue, 9 Jun 2026 16:24:46 +0530 Subject: [PATCH 3/3] remove cpu dilation test --- core/test/TestCustomModelsFromONNX.cxx | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/core/test/TestCustomModelsFromONNX.cxx b/core/test/TestCustomModelsFromONNX.cxx index 6469f6a..902cbcc 100644 --- a/core/test/TestCustomModelsFromONNX.cxx +++ b/core/test/TestCustomModelsFromONNX.cxx @@ -80,9 +80,6 @@ #include "ConvWithAsymmetricPadding_FromONNX.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" -#include "ConvWithDilation_FromONNX.hxx" -#include "input_models/references/ConvWithDilation.ref.hxx" - #include "MaxPool1d_FromONNX.hxx" #include "input_models/references/MaxPool1d.ref.hxx" @@ -970,25 +967,6 @@ TEST(ONNX, ConvWithStridesNoPadding) } -TEST(ONNX, ConvWithDilation) -{ - constexpr float TOLERANCE = DEFAULT_TOLERANCE; - - std::vector input(49); - std::iota(input.begin(), input.end(), 0.0f); - SOFIE_ConvWithDilation::Session s("ConvWithDilation_FromONNX.dat"); - std::vector output = s.infer(input.data()); - - EXPECT_EQ(output.size(), sizeof(ConvWithDilation_ExpectedOutput::all_ones) / sizeof(float)); - - float *correct = ConvWithDilation_ExpectedOutput::all_ones; - - for (size_t i = 0; i < output.size(); ++i) { - EXPECT_LE(std::abs(output[i] - correct[i]), TOLERANCE); - } -} - - // Disables test (asymmetric padding not supported) TEST(DISABLED_ONNX, ConvWithAsymmetricPadding) {