From 4f63c424884e7b77ba84d8b9175f813f8a8d2019 Mon Sep 17 00:00:00 2001 From: Harsh Chauhan Date: Wed, 3 Jun 2026 13:34:13 +0530 Subject: [PATCH 1/2] Add ConvBatch4 batch test for conv --- core/test/ConvBatchModelGenerator.py | 92 ++++++++++++++++++ .../TestCustomModelsFromONNXForAlpakaCuda.cxx | 37 +++++++ core/test/input_models/ConvBatch4.onnx | Bin 0 -> 471 bytes .../references/ConvBatch4.ref.hxx | 5 + .../references/ConvBatch4_input.ref.hxx | 5 + 5 files changed, 139 insertions(+) create mode 100644 core/test/ConvBatchModelGenerator.py create mode 100644 core/test/input_models/ConvBatch4.onnx create mode 100644 core/test/input_models/references/ConvBatch4.ref.hxx create mode 100644 core/test/input_models/references/ConvBatch4_input.ref.hxx diff --git a/core/test/ConvBatchModelGenerator.py b/core/test/ConvBatchModelGenerator.py new file mode 100644 index 0000000..1082790 --- /dev/null +++ b/core/test/ConvBatchModelGenerator.py @@ -0,0 +1,92 @@ +#!/usr/bin/python3 +# +# ConvBatchModelGenerator.py +# +# Generates a batch>1 Conv ONNX model and its reference output for the SOFIE +# alpaka GPU test oracle (ConvBatch4). The reference is computed with +# onnxruntime so it reflects the exact ONNX Conv semantics, independent of +# anything SOFIE does. This model is the correctness oracle for the +# strided-batched Conv GEMM work (Conv batch>1 path). +# +# Usage: python3 ConvBatchModelGenerator.py +# Needs: pip install onnx numpy +# Writes: input_models/ConvBatch4.onnx +# input_models/references/ConvBatch4.ref.hxx (expected output) +# input_models/references/ConvBatch4_input.ref.hxx (input data) + +import os +import numpy as np +import onnx +from onnx import helper, TensorProto, numpy_helper + + +def conv2d_ref(X, W, Bs, pad, stride): + # Plain cross-correlation, matching ONNX Conv semantics for the + # symmetric-pad, unit-stride, group=1, no-dilation case used here. + Bn, Cin, H, Wd = X.shape + Cout, _, K, _ = W.shape + Xp = np.pad(X, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode="constant") + oH = (H + 2 * pad - K) // stride + 1 + oW = (Wd + 2 * pad - K) // stride + 1 + Y = np.empty((Bn, Cout, oH, oW), np.float32) + for b in range(Bn): + for co in range(Cout): + for i in range(oH): + for j in range(oW): + patch = Xp[b, :, i * stride:i * stride + K, j * stride:j * stride + K] + Y[b, co, i, j] = np.float32(np.sum(patch * W[co]) + Bs[co]) + return Y + +NAME = "ConvBatch4" +B, Cin, Cout, H, W = 4, 2, 3, 5, 5 # batch=4 is the point of this oracle +K, PAD, STRIDE = 3, 1, 1 + +OH = (H + 2 * PAD - K) // STRIDE + 1 +OW = (W + 2 * PAD - K) // STRIDE + 1 + +np.random.seed(42) +X = np.random.randn(B, Cin, H, W).astype(np.float32) +Wt = (np.random.randn(Cout, Cin, K, K) * 0.2).astype(np.float32) +Bs = (np.random.randn(Cout) * 0.1).astype(np.float32) + +node = helper.make_node( + "Conv", ["input", "W", "B"], ["output"], + kernel_shape=[K, K], pads=[PAD, PAD, PAD, PAD], + strides=[STRIDE, STRIDE], dilations=[1, 1], group=1) + +graph = helper.make_graph( + [node], NAME, + [helper.make_tensor_value_info("input", TensorProto.FLOAT, [B, Cin, H, W])], + [helper.make_tensor_value_info("output", TensorProto.FLOAT, [B, Cout, OH, OW])], + [numpy_helper.from_array(Wt, "W"), numpy_helper.from_array(Bs, "B")]) + +model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) +model.ir_version = 8 +onnx.checker.check_model(model) + +# Reference output computed independently of SOFIE. +Y = conv2d_ref(X, Wt, Bs, PAD, STRIDE) + +here = os.path.dirname(os.path.abspath(__file__)) +imdir = os.path.join(here, "input_models") +refdir = os.path.join(imdir, "references") +os.makedirs(refdir, exist_ok=True) +onnx.save(model, os.path.join(imdir, NAME + ".onnx")) + + +def emit(path, ns, decl, arr): + body = ", ".join("{:.8f}f".format(v) for v in arr.reshape(-1)) + with open(path, "w") as f: + f.write("// Auto-generated by ConvBatchModelGenerator.py - DO NOT EDIT\n") + f.write("#pragma once\n") + f.write("namespace {} {{\n".format(ns)) + f.write(" {} = {{{}}};\n".format(decl, body)) + f.write("}} // namespace {}\n".format(ns)) + + +emit(os.path.join(refdir, NAME + "_input.ref.hxx"), + NAME + "_Input", "static float data[{}]".format(X.size), X) +emit(os.path.join(refdir, NAME + ".ref.hxx"), + NAME + "_ExpectedOutput", "float output[]", Y) + +print("wrote {}.onnx and references; input {} output {}".format(NAME, X.shape, Y.shape)) diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index 5ad9383..893026f 100644 --- a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -164,6 +164,10 @@ #include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" #include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" +#include "ConvBatch4_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvBatch4.ref.hxx" +#include "input_models/references/ConvBatch4_input.ref.hxx" + #include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" #include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" @@ -2442,6 +2446,39 @@ TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding) } } +TEST_F(SofieAlpakaTest, ConvBatch4) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Batch=4 input from the generated reference header + constexpr size_t N = sizeof(ConvBatch4_Input::data) / sizeof(float); + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = ConvBatch4_Input::data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + constexpr size_t nOut = sizeof(ConvBatch4_ExpectedOutput::output) / sizeof(float); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{nOut})); + + { + SOFIE_ConvBatch4::Session session("ConvBatch4_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ConvBatch4_ExpectedOutput::output; + for (size_t i = 0; i < nOut; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + TEST_F(SofieAlpakaTest, BatchNormalization) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; diff --git a/core/test/input_models/ConvBatch4.onnx b/core/test/input_models/ConvBatch4.onnx new file mode 100644 index 0000000000000000000000000000000000000000..6d8fded09d0a5c71fa7f888dc3086c64f2a50028 GIT binary patch literal 471 zcmdUOFKU3%41)YcXx}Ig-7kpxiUFO53wt=#Z`_g||+wuC}w0l() zY^Nm(@uQO$PyAHree=WJ_6F`|w2P95Mud Date: Wed, 3 Jun 2026 14:12:40 +0530 Subject: [PATCH 2/2] Use strided-batched GEMM for non-grouped Conv batch path --- core/inc/SOFIE/ROperator_Conv.hxx | 47 ++++++++++++++++--------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/core/inc/SOFIE/ROperator_Conv.hxx b/core/inc/SOFIE/ROperator_Conv.hxx index 835a0ff..2cdbcbc 100644 --- a/core/inc/SOFIE/ROperator_Conv.hxx +++ b/core/inc/SOFIE/ROperator_Conv.hxx @@ -317,7 +317,9 @@ public: } std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; - std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; + // _xcol holds the im2col of every batch sample, so the non-grouped GPU path can + // run a single strided-batched GEMM over all samples (each gets its own slice). + std::vector shape2 = {fShapeX[0], Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); convK = fNX +"_f"; @@ -874,22 +876,21 @@ public: // Step 3 + 4: Im2Col then GEMM — structure differs for grouped vs non-grouped // ----------------------------------------------------------------------- if (fAttrGroup == 1) { - // Non-grouped: single im2col per batch, then GEMM - out << SP << SP << "// Step 3: im2col\n"; + // Non-grouped: im2col this sample into its own _xcol slice (slice n). The + // single strided-batched GEMM over all samples is issued after the loop. out << SP << SP << "{\n"; out << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast(1));\n"; out << SP << SP << SP << "auto const elementsPerGrid_im2col = Vec::all(Idx{" << colElements << "});\n"; out << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n"; out << SP << SP << SP << "alpaka::exec(queue, workDiv_im2col, im2colKernel_" << opName << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + x_offset" - << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << imcol << ") + n * " << colElements << "u" << ", static_cast(" << colElements << "));\n"; - out << SP << SP << SP << "alpaka::wait(queue);\n"; out << SP << SP << "}\n\n"; if (!fNB.empty()) { size_t biasElements = gemm_n * gemm_m; - out << SP << SP << "// Step 4a: broadcast bias into output slice\n"; + out << SP << SP << "// broadcast bias into this sample's output slice\n"; out << SP << SP << "{\n"; out << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast(1));\n"; out << SP << SP << SP << "auto const elementsPerGrid_bias = Vec::all(Idx{" << biasElements << "});\n"; @@ -898,24 +899,8 @@ public: << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")" << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset" << ", static_cast(" << biasElements << "));\n"; - out << SP << SP << SP << "alpaka::wait(queue);\n"; out << SP << SP << "}\n\n"; - out << SP << SP << "// Step 4b: GEMM beta=1 accumulates onto bias-initialised output\n"; - out << SP << SP << "blas.matmul('n', 'n', " - << gemm_m << ", " << gemm_n << ", " << gemm_k - << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" - << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" - << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n"; - } else { - out << SP << SP << "// Step 4: GEMM beta=0 (no bias)\n"; - out << SP << SP << "blas.matmul('n', 'n', " - << gemm_m << ", " << gemm_n << ", " << gemm_k - << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" - << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" - << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n"; } - // Wait for GEMM to finish before next batch overwrites the shared _xcol buffer. - out << SP << SP << "alpaka::wait(queue);\n\n"; } else { // Grouped convolution: im2col and GEMM per group with group-adjusted input pointer. @@ -970,6 +955,21 @@ public: } out << SP << "}\n"; // end batch loop + + // Non-grouped: replace the per-sample matmul loop with one strided-batched GEMM. + // Each sample reads its own _xcol slice (strideA = colElements) and writes its own + // output block (strideC = gemm_n*gemm_m); the weight _f is shared, so strideB = 0. + if (fAttrGroup == 1) { + std::string convBeta = fNB.empty() ? "0.0f" : "1.0f"; + out << SP << "alpaka::wait(queue);\n"; + out << SP << "blas.gemmStridedBatched('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k << ", 1.0f, " + << "alpaka::getPtrNative(deviceBuf_" << imcol << "), " << gemm_m << ", " << colElements << ", " + << "alpaka::getPtrNative(deviceBuf_" << convK << "), " << gemm_k << ", 0, " + << convBeta << ", alpaka::getPtrNative(deviceBuf_" << fNY << "), " + << gemm_m << ", " << gemm_n * gemm_m << ", " << bsize << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + } return out.str(); } @@ -979,6 +979,9 @@ public: std::string GetBlasConfig(){ + // Non-grouped Conv uses gemmStridedBatched (legacy cuBLAS, no cuBLASLt layout + // registration). Grouped Conv still uses the per-group matmul path below. + if (fAttrGroup == 1) return ""; size_t oDepth_ = (fDim > 2) ? fShapeY[2].dim : 1; size_t oHeight_ = (fDim > 1) ? fShapeY[fDim].dim : 1; size_t oWidth_ = fShapeY[fDim + 1].dim;