From aaf2d08622ad1022351d0f20fc0d53bda35caf2b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 11:55:47 +0100 Subject: [PATCH 01/23] feat: configure leading dimensions explicitly --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index cbf18e0..7c9b328 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -118,10 +118,10 @@ class BlasCuda { } } - void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k) { - CheckAndAddLayout(k, m); - CheckAndAddLayout(k, n); - CheckAndAddLayout(m, n); + void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k, std::size_t lda, std::size_t ldb, std::size_t ldc) { + CheckAndAddLayout(k, m, lda); + CheckAndAddLayout(k, n, ldb); + CheckAndAddLayout(m, n, ldc); } template @@ -313,11 +313,10 @@ gemmrelu(char transa, char transb, const unsigned int m, private: alpaka::QueueCudaRtNonBlocking m_queue; - void CheckAndAddLayout(size_t rows, size_t cols) { + void CheckAndAddLayout(size_t rows, size_t cols, size_t ld) { auto key = std::make_pair(rows, cols); if (LayoutStore.find(key) == LayoutStore.end()) { cublasLtMatrixLayout_t temp = nullptr; - size_t ld = rows; CHECK_CUBLAS( cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); LayoutStore.emplace(key, temp); From 9aa4b884854ed82985667e8481710f3124d7dd48 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 14:31:55 +0100 Subject: [PATCH 02/23] fix: set stream while initializing handle --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 7c9b328..58c0f27 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -49,7 +49,6 @@ struct PairEq { class BlasCuda { cublasLtHandle_t ltHandle = nullptr; - cublasHandle_t handle = nullptr; cublasLtMatmulDesc_t operationDesc = nullptr; cublasLtMatmulPreference_t preference = nullptr; void *d_workspace = nullptr; @@ -72,7 +71,7 @@ class BlasCuda { BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); CHECK_CUBLAS(cublasLtCreate(<Handle)); - CHECK_CUBLAS(cublasCreate(&handle)); + CHECK_CUBLAS(cublasSetStream(ltHandle, stream)); heuristic = {}; CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From 2498197aec00d4c65f4c46eec23d75e253b468b6 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 14:52:00 +0100 Subject: [PATCH 03/23] feat: print requested workspace --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 58c0f27..1c9dd24 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -71,7 +71,6 @@ class BlasCuda { BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); CHECK_CUBLAS(cublasLtCreate(<Handle)); - CHECK_CUBLAS(cublasSetStream(ltHandle, stream)); heuristic = {}; CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); @@ -170,7 +169,8 @@ gemm(char transa, char transb, const unsigned int m, 1, &localHeuristic, &returnedResults)); - + std::cout << "Requested workspace: " + << localHeuristic.workspaceSize << std::endl; if (returnedResults == 0) { cublasLtMatmulDescDestroy(localDesc); std::cerr << "No suitable cuBLASLt algorithm found!\n"; @@ -237,7 +237,8 @@ gemmrelu(char transa, char transb, const unsigned int m, 1, &localHeuristic, &error_flag)); - + std::cout << "Requested workspace: " + << localHeuristic.workspaceSize << std::endl; if (error_flag == 0) { cublasLtMatmulDescDestroy(localDesc); std::cerr << "No suitable cuBLASLt algorithm found!\n"; From 901c5ae33fd687b7c3ad0b01babf0b0008846ca7 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 2 Mar 2026 17:31:13 +0100 Subject: [PATCH 04/23] fix: provide layout order during adding --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 1c9dd24..e921ebb 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -319,6 +319,8 @@ gemmrelu(char transa, char transb, const unsigned int m, cublasLtMatrixLayout_t temp = nullptr; CHECK_CUBLAS( cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); + cublasLtMatrixLayoutSetAttribute( + temp, CUBLASLT_MATRIX_LAYOUT_ORDER, CUBLASLT_ORDER_COL, sizeof(int)); LayoutStore.emplace(key, temp); } } From 8a27ffe044d87c4f87269bd3e5d8772769edae7d Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 Mar 2026 10:38:08 +0100 Subject: [PATCH 05/23] fix: layout order configuration --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index e921ebb..1c9dd24 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -319,8 +319,6 @@ gemmrelu(char transa, char transb, const unsigned int m, cublasLtMatrixLayout_t temp = nullptr; CHECK_CUBLAS( cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); - cublasLtMatrixLayoutSetAttribute( - temp, CUBLASLT_MATRIX_LAYOUT_ORDER, CUBLASLT_ORDER_COL, sizeof(int)); LayoutStore.emplace(key, temp); } } From 54dce6320accc9aea63bb3401cb19da36dbef779 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 Mar 2026 10:46:56 +0100 Subject: [PATCH 06/23] fix: remove look for requested workspace --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 1c9dd24..47d4b98 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -169,8 +169,6 @@ gemm(char transa, char transb, const unsigned int m, 1, &localHeuristic, &returnedResults)); - std::cout << "Requested workspace: " - << localHeuristic.workspaceSize << std::endl; if (returnedResults == 0) { cublasLtMatmulDescDestroy(localDesc); std::cerr << "No suitable cuBLASLt algorithm found!\n"; From c5128e65d644826766fdaba0417432765d4d357b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 11:31:30 +0200 Subject: [PATCH 07/23] feat: matmul method for without bias matrix multiplication condition --- .../backends/cuda/sofieBLAS_cublas.hpp | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 47d4b98..831d0cb 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -308,6 +308,66 @@ gemmrelu(char transa, char transb, const unsigned int m, workspaceSize, stream)); } + // matmul without bias + template + inline void + matmul(char transa, char transb, const unsigned int m, + const unsigned int n, const unsigned int k, + const float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + const float beta, + alpaka::BufCudaRt, TIdx> &C) + { + cublasLtMatmulDesc_t localDesc = nullptr; + CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + + cublasOperation_t transB_op = charToCuBlasTranspose(transb); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); + + cublasOperation_t transA_op = charToCuBlasTranspose(transa); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); + + + cublasLtMatmulHeuristicResult_t localHeuristic{}; + int returnedResults = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + localDesc, + LayoutStore.at({k, m}), + LayoutStore.at({k, n}), + LayoutStore.at({m, n}), + LayoutStore.at({m, n}), + preference, + 1, + &localHeuristic, + &returnedResults)); + if (returnedResults == 0) { + cublasLtMatmulDescDestroy(localDesc); + std::cerr << "No suitable cuBLASLt algorithm found!\n"; + exit(EXIT_FAILURE); + } + + CHECK_CUBLAS(cublasLtMatmul( + ltHandle, + localDesc, + &alpha, + alpaka::getPtrNative(A), LayoutStore.at({k, m}), + alpaka::getPtrNative(B), LayoutStore.at({k, n}), + &beta, + alpaka::getPtrNative(C), LayoutStore.at({m, n}), + alpaka::getPtrNative(C), LayoutStore.at({m, n}), + &(localHeuristic.algo), + d_workspace, + workspaceSize, + stream)); + + cudaDeviceSynchronize(); + CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); + } + private: alpaka::QueueCudaRtNonBlocking m_queue; From 9282ba9770d9064f6d77e8571081957b4051c754 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 11:42:30 +0200 Subject: [PATCH 08/23] feat: option to pass gpu pointers directly --- .../backends/cuda/sofieBLAS_cublas.hpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 831d0cb..39d3b5b 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -308,6 +308,7 @@ gemmrelu(char transa, char transb, const unsigned int m, workspaceSize, stream)); } + // matmul without bias template inline void @@ -319,6 +320,24 @@ gemmrelu(char transa, char transb, const unsigned int m, const float beta, alpaka::BufCudaRt, TIdx> &C) { + + matmul(transa, transb, m, n, k, alpha, + reinterpret_cast(alpaka::getPtrNative(A)), + reinterpret_cast(alpaka::getPtrNative(B)), + beta, + reinterpret_cast(alpaka::getPtrNative(C))); + } + + template + inline void + matmul(char transa, char transb, const unsigned int m, + const unsigned int n, const unsigned int k, + const float alpha, + void const &A, + void const &B, + const float beta, + void &C) + { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); @@ -354,11 +373,11 @@ gemmrelu(char transa, char transb, const unsigned int m, ltHandle, localDesc, &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), + A, LayoutStore.at({k, m}), + B, LayoutStore.at({k, n}), &beta, - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), + C, LayoutStore.at({m, n}), + C, LayoutStore.at({m, n}), &(localHeuristic.algo), d_workspace, workspaceSize, From fc9073798360fabe61c6de31dc4e3d8c2da76049 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:08:02 +0200 Subject: [PATCH 09/23] fix: function signature for matmul method with void pointers --- .../sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 39d3b5b..c174ad3 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -322,21 +322,21 @@ gemmrelu(char transa, char transb, const unsigned int m, { matmul(transa, transb, m, n, k, alpha, - reinterpret_cast(alpaka::getPtrNative(A)), - reinterpret_cast(alpaka::getPtrNative(B)), + alpaka::getPtrNative(A), + alpaka::getPtrNative(B), beta, - reinterpret_cast(alpaka::getPtrNative(C))); + alpaka::getPtrNative(C)); } - + template inline void matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, const float alpha, - void const &A, - void const &B, + void const * A, + void const * B, const float beta, - void &C) + void * C) { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From b591c4a11085a480821077713205a8da06675fb3 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:16:20 +0200 Subject: [PATCH 10/23] fix: template types for arguments to function with pointer signatures --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index c174ad3..b8abe23 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -333,10 +333,10 @@ gemmrelu(char transa, char transb, const unsigned int m, matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, const float alpha, - void const * A, - void const * B, + T const * A, + T const * B, const float beta, - void * C) + T * C) { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From 19b5b3f4951f8caf46b742d965612f13d58696af Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:40:23 +0200 Subject: [PATCH 11/23] fix: use float signatures for pointer arguments for blascuda --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index b8abe23..9b9c5e4 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -333,10 +333,10 @@ gemmrelu(char transa, char transb, const unsigned int m, matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, const float alpha, - T const * A, - T const * B, + float * A, + float * B, const float beta, - T * C) + float * C) { cublasLtMatmulDesc_t localDesc = nullptr; CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); From c9954648aec4460d4a193296954df8201dbab3e7 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:50:40 +0200 Subject: [PATCH 12/23] fix: use explicit data type for signatures with pointer arguments --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 9b9c5e4..57da5e7 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -328,7 +328,6 @@ gemmrelu(char transa, char transb, const unsigned int m, alpaka::getPtrNative(C)); } - template inline void matmul(char transa, char transb, const unsigned int m, const unsigned int n, const unsigned int k, From 75f75a4ef66a0654ff4e8380f0ecee19c208e62b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 12:58:37 +0200 Subject: [PATCH 13/23] fix: non transpose axis for matmul method --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 57da5e7..21269fc 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -354,7 +354,7 @@ gemmrelu(char transa, char transb, const unsigned int m, CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( ltHandle, localDesc, - LayoutStore.at({k, m}), + LayoutStore.at({m, k}), LayoutStore.at({k, n}), LayoutStore.at({m, n}), LayoutStore.at({m, n}), @@ -372,7 +372,7 @@ gemmrelu(char transa, char transb, const unsigned int m, ltHandle, localDesc, &alpha, - A, LayoutStore.at({k, m}), + A, LayoutStore.at({m, k}), B, LayoutStore.at({k, n}), &beta, C, LayoutStore.at({m, n}), From ed303fbc7edef361332afbc857f65f1e9e8c26b1 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 17 Apr 2026 13:38:18 +0200 Subject: [PATCH 14/23] fix (experimental): layout shape order --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 21269fc..de36507 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -117,7 +117,7 @@ class BlasCuda { } void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k, std::size_t lda, std::size_t ldb, std::size_t ldc) { - CheckAndAddLayout(k, m, lda); + CheckAndAddLayout(m, k, lda); CheckAndAddLayout(k, n, ldb); CheckAndAddLayout(m, n, ldc); } From 54a94013c8203a1b3ac88dd365623dde0a957da6 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 4 May 2026 16:55:12 +0200 Subject: [PATCH 15/23] feat: cuda cleanup and cpu blas api --- .gitignore | 5 +- include/.vscode/settings.json | 8 - .../sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp | 86 ++- .../backends/cuda/sofieBLAS_cublas.hpp | 461 +++++-------- tests/CMakeLists.txt | 14 +- tests/test.cc | 634 +++++++++++++++--- 6 files changed, 796 insertions(+), 412 deletions(-) delete mode 100644 include/.vscode/settings.json diff --git a/.gitignore b/.gitignore index 475c359..2f9ffe1 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,7 @@ *.dwo # build files -**/build/ \ No newline at end of file +**/build/ + +# vscode settings +.vscode/ \ No newline at end of file diff --git a/include/.vscode/settings.json b/include/.vscode/settings.json deleted file mode 100644 index 64f45f4..0000000 --- a/include/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "files.associations": { - "array": "cpp", - "string": "cpp", - "string_view": "cpp", - "span": "cpp" - } -} diff --git a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp index 263703c..23b205f 100644 --- a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp +++ b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp @@ -15,6 +15,8 @@ #endif #include +#include +#include class BlasCpu { public: @@ -36,18 +38,82 @@ class BlasCpu { } } + // C = alpha * op(A) * op(B) + beta * C (no bias, leading dims inferred) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, beta, alpaka::getPtrNative(C), static_cast(m)); + } + + // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per + // row) Matches the cuBLASLt EPILOGUE_BIAS semantics: the bias buffer serves + // as both the beta-scaled accumulator and provides the per-row bias vector + // (first m elements). template inline void - gemm(char transa, char transb, const unsigned int m, const unsigned int n, - const unsigned int k, const float alpha, - alpaka::BufCpu, TIdx> const &A, const int lda, - alpaka::BufCpu, TIdx> const &B, const int ldb, - const float beta, alpaka::BufCpu, TIdx> &C, - const int ldc) { - CBLAS_TRANSPOSE TransA = charToTranspose(transa); - CBLAS_TRANSPOSE TransB = charToTranspose(transb); - cblas_sgemm(CblasColMajor, TransA, TransB, m, n, k, alpha, A.data(), lda, - B.data(), ldb, beta, C.data(), ldc); + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + // Step 1: C = alpha * op(A) * op(B) (beta=0 so C is fully overwritten) + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, 0.0f, alpaka::getPtrNative(C), static_cast(m)); + // Step 2: C += beta * bias_matrix + bias_vec (per-row broadcast) + float *c = alpaka::getPtrNative(C); + const float *b = alpaka::getPtrNative(bias); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + c[j * m + i] += beta * b[j * m + i] + b[i]; + } + + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + for (unsigned int i = 0; i < m * n; ++i) + c[i] = c[i] > 0.0f ? c[i] : 0.0f; + } + + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) + // Uses the standard GELU: x * 0.5 * (1 + erf(x / sqrt(2))) + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); } }; diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index de36507..3c4e93c 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -15,7 +15,7 @@ #include #define CHECK_CUDA(err) \ - if (err != cudaSuccess) { \ + if ((err) != cudaSuccess) { \ std::cerr << "CUDA error: " << cudaGetErrorString(err) << " at line " \ << __LINE__ << "\n"; \ exit(EXIT_FAILURE); \ @@ -23,10 +23,9 @@ #define CHECK_CUBLAS(status) \ do { \ - cublasStatus_t s = (status); \ - if (s != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "cuBLAS error " << s << " at line " << __LINE__ \ - << std::endl; \ + cublasStatus_t _s = (status); \ + if (_s != CUBLAS_STATUS_SUCCESS) { \ + std::cerr << "cuBLAS error " << _s << " at line " << __LINE__ << "\n"; \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -49,31 +48,23 @@ struct PairEq { class BlasCuda { cublasLtHandle_t ltHandle = nullptr; - cublasLtMatmulDesc_t operationDesc = nullptr; cublasLtMatmulPreference_t preference = nullptr; void *d_workspace = nullptr; - size_t workspaceSize = 1 << 22; // 4MB + size_t workspaceSize = 1 << 22; // 4 MB cudaStream_t stream = nullptr; - cublasLtMatmulHeuristicResult_t heuristic; - cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT; - int error_flag = 0; - std::unordered_map, cublasLtMatrixLayout_t, PairHash, PairEq> - LayoutStore; + layoutStore; public: - BlasCuda(const BlasCuda&) = delete; - BlasCuda& operator=(const BlasCuda&) = delete; - BlasCuda(BlasCuda&&) = delete; - BlasCuda& operator=(BlasCuda&&) = delete; + BlasCuda(const BlasCuda &) = delete; + BlasCuda &operator=(const BlasCuda &) = delete; + BlasCuda(BlasCuda &&) = delete; + BlasCuda &operator=(BlasCuda &&) = delete; BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); CHECK_CUBLAS(cublasLtCreate(<Handle)); - heuristic = {}; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, - CUDA_R_32F)); CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); CHECK_CUDA(cudaMalloc(&d_workspace, workspaceSize)); CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( @@ -82,22 +73,15 @@ class BlasCuda { } ~BlasCuda() { - for (auto& [key, layout] : LayoutStore) { - if (layout) { + for (auto &[key, layout] : layoutStore) + if (layout) cublasLtMatrixLayoutDestroy(layout); - } - } - LayoutStore.clear(); - if (preference) cublasLtMatmulPreferenceDestroy(preference); - if (operationDesc) - cublasLtMatmulDescDestroy(operationDesc); if (ltHandle) cublasLtDestroy(ltHandle); if (d_workspace) cudaFree(d_workspace); - } inline cublasOperation_t charToCuBlasTranspose(char trans) { @@ -116,308 +100,175 @@ class BlasCuda { } } - void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k, std::size_t lda, std::size_t ldb, std::size_t ldc) { - CheckAndAddLayout(m, k, lda); - CheckAndAddLayout(k, n, ldb); - CheckAndAddLayout(m, n, ldc); + // Register matrix layouts for a given (m, n, k, lda, ldb, ldc, transa, + // transb). Must be called before gemm/gemmrelu/gemmgelu/matmul for each + // unique combination. + void addLayoutConfig(std::size_t m, std::size_t n, std::size_t k, + std::size_t lda, std::size_t ldb, std::size_t ldc, + char transa, char transb) { + // Physical A: (m×k) if NoTrans, (k×m) if Trans + if (transa == 'N' || transa == 'n') + checkAndAddLayout(m, k, lda); + else + checkAndAddLayout(k, m, lda); + // Physical B: (k×n) if NoTrans, (n×k) if Trans + if (transb == 'N' || transb == 'n') + checkAndAddLayout(k, n, ldb); + else + checkAndAddLayout(n, k, ldb); + // C is always (m×n) + checkAndAddLayout(m, n, ldc); } -template -inline void -gemm(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &bias, - alpaka::BufCudaRt, TIdx> &C) -{ - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, sizeof(bias_ptr))); - - cublasLtEpilogue_t ep = CUBLASLT_EPILOGUE_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &ep, - sizeof(ep))); - - - cublasLtMatmulHeuristicResult_t localHeuristic{}; - int returnedResults = 0; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &returnedResults)); - if (returnedResults == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } - - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), - &beta, - alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); - - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); -} - -template -inline void -gemmrelu(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &bias, - alpaka::BufCudaRt, TIdx> &C) -{ - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, sizeof(bias_ptr))); - - cublasLtEpilogue_t ep = CUBLASLT_EPILOGUE_RELU_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &ep, sizeof(ep))); - - cublasLtMatmulHeuristicResult_t localHeuristic{}; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &error_flag)); - std::cout << "Requested workspace: " - << localHeuristic.workspaceSize << std::endl; - if (error_flag == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } - - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), - &beta, - alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); - - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); -} + // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per + // row) + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, float beta, + alpaka::BufCudaRt, TIdx> &bias, + alpaka::BufCudaRt, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) template - inline void gemmgelu(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, alpaka::BufCudaRt, TIdx> const &A, alpaka::BufCudaRt, TIdx> const &B, - const float beta, + float beta, alpaka::BufCudaRt, TIdx> &bias, alpaka::BufCudaRt, TIdx> &C) { - - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, - sizeof(bias_ptr))); - - cublasOperation_t transB = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); - - cublasOperation_t transA = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); - SetGeluActivation(); - - cublasLtMatmulHeuristicResult_t localHeuristic{}; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, 1, &localHeuristic, &error_flag)); - if (error_flag == 0) { - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } - - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, localDesc, &alpha, alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), &beta, alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), &(localHeuristic.algo), d_workspace, - workspaceSize, stream)); + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } - - // matmul without bias + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) template - inline void - matmul(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &C) - { - - matmul(transa, transb, m, n, k, alpha, - alpaka::getPtrNative(A), - alpaka::getPtrNative(B), - beta, - alpaka::getPtrNative(C)); + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + float beta, + alpaka::BufCudaRt, TIdx> &bias, + alpaka::BufCudaRt, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } - inline void - matmul(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - float * A, - float * B, - const float beta, - float * C) - { - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); - - - cublasLtMatmulHeuristicResult_t localHeuristic{}; - int returnedResults = 0; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({m, k}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &returnedResults)); - if (returnedResults == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } - - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - A, LayoutStore.at({m, k}), - B, LayoutStore.at({k, n}), - &beta, - C, LayoutStore.at({m, n}), - C, LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); - - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); + // C = alpha * op(A) * op(B) + beta * C (no bias) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + float beta, + alpaka::BufCudaRt, TIdx> &C) { + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT); + float *c = alpaka::getPtrNative(C); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, c, c, layoutKeyA(transa, m, k), + layoutKeyB(transb, k, n), {m, n}); } private: alpaka::QueueCudaRtNonBlocking m_queue; - void CheckAndAddLayout(size_t rows, size_t cols, size_t ld) { + // Returns the layout map key for matrix A based on its transpose flag. + // Physical dimensions: NoTrans → (m×k), Trans → (k×m). + static std::pair + layoutKeyA(char trans, std::size_t m, std::size_t k) { + return (trans == 'N' || trans == 'n') ? std::make_pair(m, k) + : std::make_pair(k, m); + } + + // Returns the layout map key for matrix B based on its transpose flag. + // Physical dimensions: NoTrans → (k×n), Trans → (n×k). + static std::pair + layoutKeyB(char trans, std::size_t k, std::size_t n) { + return (trans == 'N' || trans == 'n') ? std::make_pair(k, n) + : std::make_pair(n, k); + } + + void checkAndAddLayout(std::size_t rows, std::size_t cols, std::size_t ld) { auto key = std::make_pair(rows, cols); - if (LayoutStore.find(key) == LayoutStore.end()) { - cublasLtMatrixLayout_t temp = nullptr; + if (layoutStore.find(key) == layoutStore.end()) { + cublasLtMatrixLayout_t layout = nullptr; CHECK_CUBLAS( - cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); - LayoutStore.emplace(key, temp); + cublasLtMatrixLayoutCreate(&layout, CUDA_R_32F, rows, cols, ld)); + layoutStore.emplace(key, layout); } } - void ResetActivation() { - epilogue = CUBLASLT_EPILOGUE_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); - } - - void SetReluActivation() { - epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + // Creates a per-call matmul descriptor with transpose ops, epilogue, and + // optional bias pointer. Caller owns the returned descriptor. + cublasLtMatmulDesc_t makeDesc(cublasOperation_t transA, + cublasOperation_t transB, + cublasLtEpilogue_t epilogue, + const void *bias_ptr = nullptr) { + cublasLtMatmulDesc_t desc = nullptr; + CHECK_CUBLAS( + cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); + if (bias_ptr) { + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, + sizeof(bias_ptr))); + } + return desc; } - void SetGeluActivation() { - epilogue = CUBLASLT_EPILOGUE_GELU; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + // Runs heuristic selection, executes cublasLtMatmul, syncs stream, and + // destroys desc. D_in is the matrix scaled by beta (may equal C_out for + // in-place accumulation). + void executeMatmul(cublasLtMatmulDesc_t desc, float alpha, const float *A, + const float *B, float beta, const float *D_in, + float *C_out, + const std::pair &kA, + const std::pair &kB, + const std::pair &kC) { + cublasLtMatmulHeuristicResult_t h{}; + int returnedResults = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + ltHandle, desc, layoutStore.at(kA), layoutStore.at(kB), + layoutStore.at(kC), layoutStore.at(kC), preference, 1, &h, + &returnedResults)); + if (returnedResults == 0) { + cublasLtMatmulDescDestroy(desc); + std::cerr << "No suitable cuBLASLt algorithm found!\n"; + exit(EXIT_FAILURE); + } + CHECK_CUBLAS(cublasLtMatmul(ltHandle, desc, &alpha, A, layoutStore.at(kA), + B, layoutStore.at(kB), &beta, D_in, + layoutStore.at(kC), C_out, layoutStore.at(kC), + &h.algo, d_workspace, workspaceSize, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUBLAS(cublasLtMatmulDescDestroy(desc)); } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2ebded2..0a4aa01 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,10 +19,15 @@ endif() # --- Compiler flags --- set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF) set(CXX_HOST_FLAGS -fPIC -pthread) -set(CUDA_ARCH "sm_86") -set(CXX_CUDA_FLAGS -arch=${CUDA_ARCH} -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr) +set(CXX_CUDA_FLAGS -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr) set(XCOMPILER_FLAGS -Xcompiler=-fPIC,-pthread) +# --- CUDA architecture: must be set before enable_language(CUDA) so all +# targets inherit a valid default (CMake 3.18+ requires CUDA_ARCHITECTURES +# to be non-empty on every target once CUDA is enabled globally). +set(CMAKE_CUDA_ARCHITECTURES 86) +enable_language(CUDA) + # --- Include directories --- include_directories(${ALPAKA_BASE}/include "../include") @@ -86,12 +91,9 @@ target_include_directories(test_cpu PRIVATE "../sofieBLAS/include" ${ALPAKA_BASE target_link_libraries(test_cpu PRIVATE ${BLAS_LIBS}) -set(TEST_SRC test.cc) - add_executable(test_cuda) target_sources(test_cuda PRIVATE test.cc) set_source_files_properties(test.cc PROPERTIES LANGUAGE CUDA) -enable_language(CUDA) set_target_properties(test_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_features(test_cuda PUBLIC cxx_std_20) @@ -104,7 +106,7 @@ target_compile_options(test_cuda PRIVATE target_compile_definitions(test_cuda PRIVATE ALPAKA_ACC_GPU_CUDA_ENABLED) target_include_directories(test_cuda PRIVATE ${ALPAKA_BASE}/include "../sofieBLAS/include" ${CUDA_BASE}/include) target_link_directories(test_cuda PRIVATE ${CUDA_BASE}/lib64) -target_link_libraries(test_cuda PRIVATE cublas cudart) +target_link_libraries(test_cuda PRIVATE cublasLt cublas cudart) # --- clean target equivalent --- add_custom_target(clean-all diff --git a/tests/test.cc b/tests/test.cc index 5812bc9..e106275 100644 --- a/tests/test.cc +++ b/tests/test.cc @@ -1,121 +1,591 @@ #include "sofieBLAS/sofieBLAS.hpp" #include +#include +#include #include #include -#include +#include +#include -// index and size type using Idx = uint32_t; - -// dimensions -using Dim0D = alpaka::DimInt<0u>; using Dim1D = alpaka::DimInt<1u>; -using Dim2D = alpaka::DimInt<2u>; -using Dim3D = alpaka::DimInt<3u>; - -// Print a column-major matrix -template -void print(alpaka::BufCpu const &M, TIdx size) { - assert(alpaka::getExtentProduct(M) == size * size); - - for (TIdx row = 0; row < size; ++row) { - for (TIdx col = 0; col < size; ++col) { - std::cout << std::fixed << std::setprecision(2) << std::setw(7) - << M[col * size + row] << " "; + +// --------------------------------------------------------------------------- +// Reference implementations (column-major, float) +// --------------------------------------------------------------------------- + +// Access element (row, col) of a column-major matrix with leading dim `ld`. +static inline float cm(const float *M, int row, int col, int ld) { + return M[col * ld + row]; +} + +// C = alpha * op(A) * op(B) + beta * C (in-place, column-major) +static void refMatmul(float *C, const float *A, const float *B, int m, int n, + int k, float alpha, float beta, bool transA, + bool transB) { + int lda = transA ? k : m; + int ldb = transB ? n : k; + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + float sum = 0.f; + for (int p = 0; p < k; ++p) { + float a = transA ? cm(A, p, i, lda) : cm(A, i, p, lda); + float b = transB ? cm(B, j, p, ldb) : cm(B, p, j, ldb); + sum += a * b; + } + C[j * m + i] = alpha * sum + beta * C[j * m + i]; } - std::cout << "\n"; } } -int main() { - constexpr Idx size = 4; +// C = alpha * op(A) * op(B) + beta * bias_matrix + bias_vec (per-row broadcast) +static void refGemm(float *C, const float *A, const float *B, const float *bias, + int m, int n, int k, float alpha, float beta, bool transA, + bool transB) { + int lda = transA ? k : m; + int ldb = transB ? n : k; + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + float sum = 0.f; + for (int p = 0; p < k; ++p) { + float a = transA ? cm(A, p, i, lda) : cm(A, i, p, lda); + float b = transB ? cm(B, j, p, ldb) : cm(B, p, j, ldb); + sum += a * b; + } + C[j * m + i] = alpha * sum + beta * bias[j * m + i] + bias[i]; + } + } +} - // Host platform and device - alpaka::PlatformCpu host_platform{}; - auto host = alpaka::getDevByIdx(host_platform, 0u); +static void refGemmRelu(float *C, const float *A, const float *B, + const float *bias, int m, int n, int k, float alpha, + float beta, bool transA, bool transB) { + refGemm(C, A, B, bias, m, n, k, alpha, beta, transA, transB); + for (int i = 0; i < m * n; ++i) + C[i] = C[i] > 0.f ? C[i] : 0.f; +} - // Allocate matrices (column-major) - auto A = alpaka::allocBuf(host, size * size); - auto B = alpaka::allocBuf(host, size * size); - auto C = alpaka::allocBuf(host, size * size); +static void refGemmGelu(float *C, const float *A, const float *B, + const float *bias, int m, int n, int k, float alpha, + float beta, bool transA, bool transB) { + refGemm(C, A, B, bias, m, n, k, alpha, beta, transA, transB); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (int i = 0; i < m * n; ++i) + C[i] *= 0.5f * (1.f + std::erff(C[i] * kInvSqrt2)); +} - // Fill A and B with random floats centered around 0 - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution dist(0.0f, 10.0f); +static int gFailures = 0; - for (int i = 0; i < size * size; ++i) { - A[i] = dist(gen); - B[i] = dist(gen); +static void checkClose(const float *got, const float *expected, int n, + const std::string &name, float rtol = 1e-4f, + float atol = 1e-4f) { + bool pass = true; + for (int i = 0; i < n; ++i) { + float diff = std::abs(got[i] - expected[i]); + float thr = atol + rtol * std::abs(expected[i]); + if (diff > thr) { + std::cerr << " FAIL [" << name << "] idx=" << i << " got=" << got[i] + << " expected=" << expected[i] << " diff=" << diff << "\n"; + pass = false; + } } - std::cout << "Matrix A:\n"; - print(A, size); - std::cout << '\n'; - std::cout << "Matrix B:\n"; - print(B, size); - std::cout << '\n'; + if (pass) + std::cout << " PASS " << name << "\n"; + else + ++gFailures; +} -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +// --------------------------------------------------------------------------- +// Helpers to fill test matrices +// --------------------------------------------------------------------------- + +static void fillSeq(float *M, int n, float start = 1.f, float step = 1.f) { + for (int i = 0; i < n; ++i) + M[i] = start + static_cast(i) * step; +} + +static void fillVal(float *M, int n, float v) { + for (int i = 0; i < n; ++i) + M[i] = v; +} + +// --------------------------------------------------------------------------- +// CPU tests +// --------------------------------------------------------------------------- + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +static void runCpuTests() { + std::cout << "\n=== CPU Tests ===\n"; + + alpaka::PlatformCpu platform{}; + auto dev = alpaka::getDevByIdx(platform, 0u); + alpaka::Queue queue{dev}; + sofieBLAS blas(queue); + + constexpr int M = 4, N = 3, K = 5; + + // Allocate host buffers + auto hA = alpaka::allocBuf(dev, static_cast(M * K)); + auto hB = alpaka::allocBuf(dev, static_cast(K * N)); + auto hC = alpaka::allocBuf(dev, static_cast(M * N)); + auto hBias = alpaka::allocBuf(dev, static_cast(M * N)); + + float *A = alpaka::getPtrNative(hA); + float *B = alpaka::getPtrNative(hB); + float *C = alpaka::getPtrNative(hC); + float *bias = alpaka::getPtrNative(hBias); + + fillSeq(A, M * K); + fillSeq(B, K * N, 1.f, 0.5f); + fillSeq(bias, M * N, 0.1f, 0.1f); + + std::vector ref(M * N); + + // --- matmul NN --- + fillVal(C, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hC); + std::copy(C, C + M * N, ref.data()); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::matmul NN"); + + // --- matmul TN (A^T: K×M physical → M×K logical) --- + // For TN we need A to be K×M so we create a separate buffer + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + blas.matmul('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, B, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::matmul TN"); + } + + // --- matmul NT --- + { + auto hBt = alpaka::allocBuf(dev, static_cast(N * K)); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(Bt, N * K, 1.f, 0.5f); + fillVal(C, M * N, 0.f); + blas.matmul('N', 'T', M, N, K, 1.f, hA, hBt, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, Bt, M, N, K, 1.f, 0.f, false, true); + checkClose(C, ref.data(), M * N, "cpu::matmul NT"); + } + + // --- matmul TT --- { - alpaka::PlatformCudaRt platform; - alpaka::DevCudaRt device = alpaka::getDevByIdx(platform, 0u); - alpaka::Queue queue{device}; + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + auto hBt = alpaka::allocBuf(dev, static_cast(N * K)); + float *At = alpaka::getPtrNative(hAt); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(At, K * M); + fillSeq(Bt, N * K, 1.f, 0.5f); + fillVal(C, M * N, 0.f); + blas.matmul('T', 'T', M, N, K, 1.f, hAt, hBt, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, Bt, M, N, K, 1.f, 0.f, true, true); + checkClose(C, ref.data(), M * N, "cpu::matmul TT"); + } - const Idx m = size; // rows of A and C - const Idx n = size; // columns of B and C - const Idx k = size; // columns of A and rows of B + // --- matmul: alpha scaling --- + fillVal(C, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 2.5f, hA, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 2.5f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::matmul alpha=2.5"); - const float alpha = 1.0f; - const float beta = 0.0f; + // --- matmul: beta accumulation --- + fillSeq(C, M * N, 10.f); // pre-fill C + blas.matmul('N', 'N', M, N, K, 1.f, hA, hB, 0.5f, hC); + { + std::vector C0(M * N); + fillSeq(C0.data(), M * N, 10.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.5f, false, false); + // ref already has the pre-filled values baked in via refMatmul beta path + // but refMatmul reads C in-place, so we need to re-run with correct init + std::copy(C0.begin(), C0.end(), ref.data()); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.5f, false, false); + } + checkClose(C, ref.data(), M * N, "cpu::matmul beta=0.5"); - const Idx lda = size; // leading dimension of A - const Idx ldb = size; // leading dimension of B - const Idx ldc = size; + // --- gemm NN (beta=0, no prior accumulation) --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, 0.1f, 0.1f); + blas.gemm('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemm NN beta=0"); - auto A_d = alpaka::allocAsyncBuf(queue, size * size); - auto B_d = alpaka::allocAsyncBuf(queue, size * size); - auto C_d = alpaka::allocAsyncBuf(queue, size * size); - alpaka::memcpy(queue, A_d, A); - alpaka::memcpy(queue, B_d, B); + // --- gemm NN (beta=1 accumulation) --- + fillVal(C, M * N, 0.f); + blas.gemm('N', 'N', M, N, K, 1.f, hA, hB, 1.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 1.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemm NN beta=1"); - sofieBLAS blas(queue); - blas.gemm('n', 'n', m, n, k, alpha, A_d, lda, B_d, ldb, beta, C_d, ldc); - alpaka::memcpy(queue, C, C_d); + // --- gemm TN --- + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + blas.gemm('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::gemm TN"); + } + // --- gemmrelu: all-positive matmul result stays unchanged --- + { + // A and B with positive values ensure result is positive before bias + auto hAp = alpaka::allocBuf(dev, static_cast(M * K)); + auto hBp = alpaka::allocBuf(dev, static_cast(K * N)); + auto hBiasp = alpaka::allocBuf(dev, static_cast(M * N)); + float *Ap = alpaka::getPtrNative(hAp); + float *Bp = alpaka::getPtrNative(hBp); + float *biasp = alpaka::getPtrNative(hBiasp); + fillSeq(Ap, M * K, 0.1f, 0.1f); + fillSeq(Bp, K * N, 0.1f, 0.1f); + fillVal(biasp, M * N, 0.f); + fillVal(C, M * N, 0.f); + blas.gemmrelu('N', 'N', M, N, K, 1.f, hAp, hBp, 0.f, hBiasp, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), Ap, Bp, biasp, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu all-positive"); + } + + // --- gemmrelu: negative values clamped to zero --- + { + // Use alpha=-1 to force negative results + auto hBiasz = alpaka::allocBuf(dev, static_cast(M * N)); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + fillVal(C, M * N, 0.f); + blas.gemmrelu('N', 'N', M, N, K, -1.f, hA, hB, 0.f, hBiasz, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, alpaka::getPtrNative(hBiasz), M, N, K, -1.f, + 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu alpha=-1 (clamped)"); + } + + // --- gemmrelu with bias --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, -5.f, 2.f); // mixed sign bias + blas.gemmrelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu with mixed bias"); + + // --- gemmgelu NN --- + fillVal(C, M * N, 0.f); + fillVal(bias, M * N, 0.f); + blas.gemmgelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu NN"); + + // --- gemmgelu with bias --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, -2.f, 0.5f); + blas.gemmgelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu with bias"); + + // --- gemmgelu TN --- + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + fillVal(bias, M * N, 0.f); + blas.gemmgelu('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu TN"); + } + + // --- edge: zero matrix --- + { + auto hZ = alpaka::allocBuf(dev, static_cast(M * K)); + fillVal(alpaka::getPtrNative(hZ), M * K, 0.f); + fillVal(C, M * N, 99.f); + fillVal(bias, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, hZ, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + checkClose(C, ref.data(), M * N, "cpu::matmul zero-A"); + } + + // --- edge: identity-like (square, known result) --- + { + constexpr int S = 3; + auto hI = alpaka::allocBuf(dev, static_cast(S * S)); + auto hX = alpaka::allocBuf(dev, static_cast(S * S)); + auto hY = alpaka::allocBuf(dev, static_cast(S * S)); + float *I = alpaka::getPtrNative(hI); + float *X = alpaka::getPtrNative(hX); + float *Y = alpaka::getPtrNative(hY); + fillVal(I, S * S, 0.f); + for (int i = 0; i < S; ++i) + I[i * S + i] = 1.f; + fillSeq(X, S * S); + fillVal(Y, S * S, 0.f); + blas.matmul('N', 'N', S, S, S, 1.f, hI, hX, 0.f, hY); + checkClose(Y, X, S * S, "cpu::matmul identity×X=X"); + } +} + +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +// --------------------------------------------------------------------------- +// CUDA tests +// --------------------------------------------------------------------------- + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + +// Helper: infer packed column-major leading dimensions +static int ldaFor(char trans, int m, int k) { + return (trans == 'N' || trans == 'n') ? m : k; +} +static int ldbFor(char trans, int k, int n) { + return (trans == 'N' || trans == 'n') ? k : n; +} + +static void runCudaTests() { + std::cout << "\n=== CUDA Tests ===\n"; + + alpaka::PlatformCudaRt platform{}; + auto dev = alpaka::getDevByIdx(platform, 0u); + alpaka::Queue queue{dev}; + sofieBLAS blas(queue); + + alpaka::PlatformCpu hostPlatform{}; + auto hostDev = alpaka::getDevByIdx(hostPlatform, 0u); + + constexpr int M = 4, N = 3, K = 5; + + auto hA = alpaka::allocBuf(hostDev, static_cast(M * K)); + auto hB = alpaka::allocBuf(hostDev, static_cast(K * N)); + auto hC = alpaka::allocBuf(hostDev, static_cast(M * N)); + auto hBias = alpaka::allocBuf(hostDev, static_cast(M * N)); + + float *A = alpaka::getPtrNative(hA); + float *B = alpaka::getPtrNative(hB); + float *bias = alpaka::getPtrNative(hBias); + + fillSeq(A, M * K); + fillSeq(B, K * N, 1.f, 0.5f); + fillVal(bias, M * N, 0.f); + + auto dA = alpaka::allocAsyncBuf(queue, static_cast(M * K)); + auto dB = alpaka::allocAsyncBuf(queue, static_cast(K * N)); + auto dC = alpaka::allocAsyncBuf(queue, static_cast(M * N)); + auto dBias = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + + alpaka::memcpy(queue, dA, hA); + alpaka::memcpy(queue, dB, hB); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + + std::vector ref(M * N); + float *C = alpaka::getPtrNative(hC); + + auto verify = [&](const std::string &name) { + alpaka::memcpy(queue, hC, dC); + alpaka::wait(queue); + checkClose(C, ref.data(), M * N, name); + }; + + // ---- matmul NN ---- + blas.addLayoutConfig(M, N, K, ldaFor('N', M, K), ldbFor('N', K, N), M, 'N', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.f, false, false); + blas.matmul('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dC); + verify("cuda::matmul NN"); + + // ---- matmul TN ---- + { + auto hAt = alpaka::allocBuf(hostDev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + auto dAt = + alpaka::allocAsyncBuf(queue, static_cast(K * M)); + alpaka::memcpy(queue, dAt, hAt); alpaka::wait(queue); - std::cout << "CUDA Matrix C = A × B:\n"; - print(C, size); - std::cout << '\n'; + blas.addLayoutConfig(M, N, K, ldaFor('T', M, K), ldbFor('N', K, N), M, 'T', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, B, M, N, K, 1.f, 0.f, true, false); + blas.matmul('T', 'N', M, N, K, 1.f, dAt, dB, 0.f, dC); + verify("cuda::matmul TN"); } -#endif -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + // ---- matmul NT ---- { - alpaka::PlatformCpu platform; - alpaka::DevCpu device = alpaka::getDevByIdx(platform, 0u); - alpaka::Queue queue{device}; + auto hBt = alpaka::allocBuf(hostDev, static_cast(N * K)); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(Bt, N * K, 1.f, 0.5f); + auto dBt = + alpaka::allocAsyncBuf(queue, static_cast(N * K)); + alpaka::memcpy(queue, dBt, hBt); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, ldaFor('N', M, K), ldbFor('T', K, N), M, 'N', + 'T'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, Bt, M, N, K, 1.f, 0.f, false, true); + blas.matmul('N', 'T', M, N, K, 1.f, dA, dBt, 0.f, dC); + verify("cuda::matmul NT"); + } + + // ---- matmul alpha=2.5 ---- + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 2.5f, 0.f, false, false); + blas.matmul('N', 'N', M, N, K, 2.5f, dA, dB, 0.f, dC); + verify("cuda::matmul alpha=2.5"); - const Idx m = size; // rows of A and C - const Idx n = size; // columns of B and C - const Idx k = size; // columns of A and rows of B + // ---- gemm NN beta=0 ---- + fillSeq(bias, M * N, 0.1f, 0.1f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemm('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemm NN beta=0"); - const float alpha = 1.0f; - const float beta = 0.0f; + // ---- gemm NN beta=1 ---- + // D_in = bias, so result = A*B + 1*bias_matrix + bias_vec + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 1.f, false, false); + blas.gemm('N', 'N', M, N, K, 1.f, dA, dB, 1.f, dBias, dC); + verify("cuda::gemm NN beta=1"); - const Idx lda = size; // leading dimension of A - const Idx ldb = size; // leading dimension of B - const Idx ldc = size; // leading dimension of C + // ---- gemm TN ---- + { + auto hAt = alpaka::allocBuf(hostDev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + auto dAt = + alpaka::allocAsyncBuf(queue, static_cast(K * M)); + alpaka::memcpy(queue, dAt, hAt); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, ldaFor('T', M, K), ldbFor('N', K, N), M, 'T', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + blas.gemm('T', 'N', M, N, K, 1.f, dAt, dB, 0.f, dBias, dC); + verify("cuda::gemm TN"); + } - sofieBLAS blas(queue); - blas.gemm('n', 'n', m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + // ---- gemmrelu: all-positive (relu is identity) ---- + { + auto hAp = alpaka::allocBuf(hostDev, static_cast(M * K)); + auto hBp = alpaka::allocBuf(hostDev, static_cast(K * N)); + auto hBiasz = + alpaka::allocBuf(hostDev, static_cast(M * N)); + float *Ap = alpaka::getPtrNative(hAp); + float *Bp = alpaka::getPtrNative(hBp); + fillSeq(Ap, M * K, 0.1f, 0.1f); + fillSeq(Bp, K * N, 0.1f, 0.1f); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + auto dAp = + alpaka::allocAsyncBuf(queue, static_cast(M * K)); + auto dBp = + alpaka::allocAsyncBuf(queue, static_cast(K * N)); + auto dBiasz = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + alpaka::memcpy(queue, dAp, hAp); + alpaka::memcpy(queue, dBp, hBp); + alpaka::memcpy(queue, dBiasz, hBiasz); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, M, K, M, 'N', 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), Ap, Bp, alpaka::getPtrNative(hBiasz), M, N, K, 1.f, + 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, 1.f, dAp, dBp, 0.f, dBiasz, dC); + verify("cuda::gemmrelu all-positive"); + } + // ---- gemmrelu: alpha=-1 forces negatives -> clamped to zero ---- + { + auto hBiasz = + alpaka::allocBuf(hostDev, static_cast(M * N)); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + auto dBiasz = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + alpaka::memcpy(queue, dBiasz, hBiasz); alpaka::wait(queue); - std::cout << "CPU Matrix C = A × B:\n"; - print(C, size); - std::cout << '\n'; + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, alpaka::getPtrNative(hBiasz), M, N, K, -1.f, + 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, -1.f, dA, dB, 0.f, dBiasz, dC); + verify("cuda::gemmrelu alpha=-1 (clamped)"); } + + // ---- gemmrelu with mixed bias ---- + fillSeq(bias, M * N, -5.f, 2.f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmrelu with mixed bias"); + + // ---- gemmgelu NN ---- + fillVal(bias, M * N, 0.f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmgelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmgelu NN"); + + // ---- gemmgelu with bias ---- + fillSeq(bias, M * N, -2.f, 0.5f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmgelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmgelu with bias"); + + // ---- edge: zero A ---- + { + auto hZero = alpaka::allocBuf(hostDev, static_cast(M * K)); + fillVal(alpaka::getPtrNative(hZero), M * K, 0.f); + auto dZero = + alpaka::allocAsyncBuf(queue, static_cast(M * K)); + alpaka::memcpy(queue, dZero, hZero); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, dZero, dB, 0.f, dC); + verify("cuda::matmul zero-A"); + } +} + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +int main() { +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + runCpuTests(); #endif +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + runCudaTests(); +#endif + + std::cout << "\n"; + if (gFailures == 0) + std::cout << "All tests passed.\n"; + else + std::cout << gFailures << " test(s) FAILED.\n"; - return 0; + return gFailures > 0 ? EXIT_FAILURE : EXIT_SUCCESS; } From 066fdaef7ae0704baab7787f212255c7b71e252f Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 4 May 2026 17:15:24 +0200 Subject: [PATCH 16/23] feat: matmul method on raw cuda pointers --- .../sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 3c4e93c..8631751 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -191,6 +191,19 @@ class BlasCuda { layoutKeyB(transb, k, n), {m, n}); } + // matmul on raw pointers directly (no layout caching, caller must ensure + // correct leading dims and layout) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const &A, T const &B, + float beta, T &C) { + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT); + executeMatmul(desc, alpha, A, B, beta, C, C, layoutKeyA(transa, m, k), + layoutKeyB(transb, k, n), {m, n}); + } + private: alpaka::QueueCudaRtNonBlocking m_queue; From 1f1c4c72d5e88c31b956d19b1f0f86e72870d182 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 4 May 2026 17:28:39 +0200 Subject: [PATCH 17/23] fix: remove extra template parameter in cuda matmul method on raw pointers --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 8631751..189d821 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -193,7 +193,7 @@ class BlasCuda { // matmul on raw pointers directly (no layout caching, caller must ensure // correct leading dims and layout) - template + template inline void matmul(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, float alpha, T const &A, T const &B, float beta, T &C) { From a8679396411d1cc033ad8dab668d0f874eacb162 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 4 May 2026 17:35:29 +0200 Subject: [PATCH 18/23] fix: method signature for matmul on cuda raw pointers --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 189d821..a89368f 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -195,8 +195,8 @@ class BlasCuda { // correct leading dims and layout) template inline void matmul(char transa, char transb, unsigned int m, unsigned int n, - unsigned int k, float alpha, T const &A, T const &B, - float beta, T &C) { + unsigned int k, float alpha, T const * A, T const * B, + float beta, T * C) { auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), CUBLASLT_EPILOGUE_DEFAULT); From ad41e993137b9d2d529f6963db2d3a8db3134d68 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 4 May 2026 18:00:02 +0200 Subject: [PATCH 19/23] fix: dimension order for checkAddLayout method in cuda --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index a89368f..5edf813 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -113,9 +113,9 @@ class BlasCuda { checkAndAddLayout(k, m, lda); // Physical B: (k×n) if NoTrans, (n×k) if Trans if (transb == 'N' || transb == 'n') - checkAndAddLayout(k, n, ldb); - else checkAndAddLayout(n, k, ldb); + else + checkAndAddLayout(k, n, ldb); // C is always (m×n) checkAndAddLayout(m, n, ldc); } From f86d299c33f9598f204e31447fe24872057787ad Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 4 May 2026 18:26:46 +0200 Subject: [PATCH 20/23] fix: correct transpose values while adding layouts for cuda gemm methods --- include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 5edf813..a89368f 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -113,9 +113,9 @@ class BlasCuda { checkAndAddLayout(k, m, lda); // Physical B: (k×n) if NoTrans, (n×k) if Trans if (transb == 'N' || transb == 'n') - checkAndAddLayout(n, k, ldb); - else checkAndAddLayout(k, n, ldb); + else + checkAndAddLayout(n, k, ldb); // C is always (m×n) checkAndAddLayout(m, n, ldc); } From 7b2133c9d7d1d81c81a71a2f8ceb9c94f8248390 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 11 May 2026 10:22:45 +0200 Subject: [PATCH 21/23] feat: gemm apis for alpaka views --- .../sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp | 70 +++++++++++++++++++ .../backends/cuda/sofieBLAS_cublas.hpp | 68 ++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp index 23b205f..e2fe0b2 100644 --- a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp +++ b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp @@ -56,6 +56,23 @@ class BlasCpu { ldb, beta, alpaka::getPtrNative(C), static_cast(m)); } + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, beta, alpaka::getPtrNative(C), static_cast(m)); + } + // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per // row) Matches the cuBLASLt EPILOGUE_BIAS semantics: the bias buffer serves // as both the beta-scaled accumulator and provides the per-row bias vector @@ -84,6 +101,30 @@ class BlasCpu { c[j * m + i] += beta * b[j * m + i] + b[i]; } + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, 0.0f, alpaka::getPtrNative(C), static_cast(m)); + float *c = alpaka::getPtrNative(C); + const float *b = alpaka::getPtrNative(bias); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + c[j * m + i] += beta * b[j * m + i] + b[i]; + } + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) template inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, @@ -99,6 +140,20 @@ class BlasCpu { c[i] = c[i] > 0.0f ? c[i] : 0.0f; } + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + for (unsigned int i = 0; i < m * n; ++i) + c[i] = c[i] > 0.0f ? c[i] : 0.0f; + } + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) // Uses the standard GELU: x * 0.5 * (1 + erf(x / sqrt(2))) template @@ -115,6 +170,21 @@ class BlasCpu { for (unsigned int i = 0; i < m * n; ++i) c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); + } }; namespace traits { diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index a89368f..dc502f6 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -138,6 +138,24 @@ class BlasCuda { layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) template inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, @@ -156,6 +174,23 @@ class BlasCuda { layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) template inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, @@ -174,6 +209,23 @@ class BlasCuda { layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + // C = alpha * op(A) * op(B) + beta * C (no bias) template inline void matmul(char transa, char transb, unsigned int m, unsigned int n, @@ -191,6 +243,22 @@ class BlasCuda { layoutKeyB(transb, k, n), {m, n}); } + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &C) { + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT); + T *c = alpaka::getPtrNative(C); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, c, c, layoutKeyA(transa, m, k), + layoutKeyB(transb, k, n), {m, n}); + } + // matmul on raw pointers directly (no layout caching, caller must ensure // correct leading dims and layout) template From 6f2a9e16e3d4daabafdb46e1e771b67921b0a64b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 11 May 2026 10:43:26 +0200 Subject: [PATCH 22/23] feat: gemm functions for cublas using raw pointers --- .../sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp | 34 +++++++++++++++++++ .../backends/cuda/sofieBLAS_cublas.hpp | 34 +++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp index e2fe0b2..6cfeb1f 100644 --- a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp +++ b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp @@ -185,6 +185,40 @@ class BlasCpu { for (unsigned int i = 0; i < m * n; ++i) c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); } + + // Raw-pointer overloads: accept T const*/T* from any BufXxx or ViewPlainPtr via getPtrNative() + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, A, lda, B, ldb, 0.0f, C, static_cast(m)); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + C[j * m + i] += beta * bias[j * m + i] + bias[i]; + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + for (unsigned int i = 0; i < m * n; ++i) + C[i] = C[i] > 0.0f ? C[i] : 0.0f; + } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + C[i] *= 0.5f * (1.0f + std::erff(C[i] * kInvSqrt2)); + } }; namespace traits { diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index dc502f6..33baadd 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -226,6 +226,40 @@ class BlasCuda { layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } + // Raw-pointer overloads: accept T const*/T* from any BufXxx or ViewPlainPtr via getPtrNative() + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + const void *bptr = bias; + auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, bptr); + executeMatmul(desc, alpha, A, B, beta, bias, C, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + const void *bptr = bias; + auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, bptr); + executeMatmul(desc, alpha, A, B, beta, bias, C, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + const void *bptr = bias; + auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, bptr); + executeMatmul(desc, alpha, A, B, beta, bias, C, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + // C = alpha * op(A) * op(B) + beta * C (no bias) template inline void matmul(char transa, char transb, unsigned int m, unsigned int n, From fa108fb4e6e3c896bfd6bb83dceafee240161474 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 25 May 2026 12:36:32 +0200 Subject: [PATCH 23/23] feat: strided batch computation for gemm in cublaslt --- .../backends/cuda/sofieBLAS_cublas.hpp | 374 +++++++++++------- 1 file changed, 228 insertions(+), 146 deletions(-) diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index 33baadd..f0b9e57 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -30,6 +30,7 @@ } \ } while (0) + struct PairHash { std::size_t operator()(const std::pair &p) const noexcept { @@ -46,16 +47,66 @@ struct PairEq { } }; +struct DescKey { + int transA; // CUBLAS_OP_N / CUBLAS_OP_T encoded as int + int transB; + int epilogue; // cublasLtEpilogue_t encoded as int + bool operator==(const DescKey &o) const noexcept { + return transA == o.transA && transB == o.transB && epilogue == o.epilogue; + } +}; + +struct DescKeyHash { + std::size_t operator()(const DescKey &k) const noexcept { + // Small values: simple polynomial hash + std::size_t h = static_cast(k.transA) * 97u + + static_cast(k.transB) * 31u + + static_cast(k.epilogue); + return h ^ (h >> 16); + } +}; + + +struct AlgoKey { + DescKey dk; + std::size_t rowsA, colsA; // physical dimensions of A in layoutStore + std::size_t rowsB, colsB; // physical dimensions of B in layoutStore + bool operator==(const AlgoKey &o) const noexcept { + return dk == o.dk + && rowsA == o.rowsA && colsA == o.colsA + && rowsB == o.rowsB && colsB == o.colsB; + } +}; + +struct AlgoKeyHash { + std::size_t operator()(const AlgoKey &k) const noexcept { + std::size_t h = DescKeyHash{}(k.dk); + auto mix = [&](std::size_t v) { + h ^= std::hash{}(v) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + }; + mix(k.rowsA); mix(k.colsA); + mix(k.rowsB); mix(k.colsB); + return h; + } +}; + class BlasCuda { - cublasLtHandle_t ltHandle = nullptr; + cublasLtHandle_t ltHandle = nullptr; + cublasHandle_t handle = nullptr; // legacy cuBLAS for batched ops cublasLtMatmulPreference_t preference = nullptr; - void *d_workspace = nullptr; - size_t workspaceSize = 1 << 22; // 4 MB + void *d_workspace = nullptr; + size_t workspaceSize = 1u << 25; // 32 MB (was 4 MB) cudaStream_t stream = nullptr; + std::unordered_map, cublasLtMatrixLayout_t, PairHash, PairEq> layoutStore; + std::unordered_map descStore; + + std::unordered_map + algoCache; + public: BlasCuda(const BlasCuda &) = delete; BlasCuda &operator=(const BlasCuda &) = delete; @@ -64,7 +115,12 @@ class BlasCuda { BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); + CHECK_CUBLAS(cublasLtCreate(<Handle)); + + CHECK_CUBLAS(cublasCreate(&handle)); + CHECK_CUBLAS(cublasSetStream(handle, stream)); + CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); CHECK_CUDA(cudaMalloc(&d_workspace, workspaceSize)); CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( @@ -74,35 +130,25 @@ class BlasCuda { ~BlasCuda() { for (auto &[key, layout] : layoutStore) - if (layout) - cublasLtMatrixLayoutDestroy(layout); - if (preference) - cublasLtMatmulPreferenceDestroy(preference); - if (ltHandle) - cublasLtDestroy(ltHandle); - if (d_workspace) - cudaFree(d_workspace); + if (layout) cublasLtMatrixLayoutDestroy(layout); + for (auto &[key, desc] : descStore) + if (desc) cublasLtMatmulDescDestroy(desc); + if (preference) cublasLtMatmulPreferenceDestroy(preference); + if (ltHandle) cublasLtDestroy(ltHandle); + if (handle) cublasDestroy(handle); + if (d_workspace) cudaFree(d_workspace); } inline cublasOperation_t charToCuBlasTranspose(char trans) { switch (trans) { - case 'N': - case 'n': - return CUBLAS_OP_N; - case 'T': - case 't': - return CUBLAS_OP_T; - case 'C': - case 'c': - return CUBLAS_OP_C; + case 'N': case 'n': return CUBLAS_OP_N; + case 'T': case 't': return CUBLAS_OP_T; + case 'C': case 'c': return CUBLAS_OP_C; default: throw std::invalid_argument("Invalid transpose character for cuBLAS."); } } - // Register matrix layouts for a given (m, n, k, lda, ldb, ldc, transa, - // transb). Must be called before gemm/gemmrelu/gemmgelu/matmul for each - // unique combination. void addLayoutConfig(std::size_t m, std::size_t n, std::size_t k, std::size_t lda, std::size_t ldb, std::size_t ldc, char transa, char transb) { @@ -120,8 +166,6 @@ class BlasCuda { checkAndAddLayout(m, n, ldc); } - // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per - // row) template inline void gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, @@ -129,12 +173,11 @@ class BlasCuda { alpaka::BufCudaRt, TIdx> const &B, float beta, alpaka::BufCudaRt, TIdx> &bias, alpaka::BufCudaRt, TIdx> &C) { - const void *bptr = alpaka::getPtrNative(bias); - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_BIAS, bptr); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } @@ -147,16 +190,24 @@ class BlasCuda { float beta, alpaka::ViewPlainPtr, TIdx> &bias, alpaka::ViewPlainPtr, TIdx> &C) { - const void *bptr = alpaka::getPtrNative(bias); - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_BIAS, bptr); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, alpha, A, B, beta, bias, C, + static_cast(bias), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } - // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) template inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, float alpha, @@ -165,12 +216,11 @@ class BlasCuda { float beta, alpaka::BufCudaRt, TIdx> &bias, alpaka::BufCudaRt, TIdx> &C) { - const void *bptr = alpaka::getPtrNative(bias); - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_RELU_BIAS, bptr); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } @@ -182,16 +232,24 @@ class BlasCuda { float beta, alpaka::ViewPlainPtr, TIdx> &bias, alpaka::ViewPlainPtr, TIdx> &C) { - const void *bptr = alpaka::getPtrNative(bias); - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_RELU_BIAS, bptr); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, alpha, A, B, beta, bias, C, + static_cast(bias), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } - // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) template inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, float alpha, @@ -200,12 +258,11 @@ class BlasCuda { float beta, alpaka::BufCudaRt, TIdx> &bias, alpaka::BufCudaRt, TIdx> &C) { - const void *bptr = alpaka::getPtrNative(bias); - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_GELU_BIAS, bptr); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } @@ -217,35 +274,11 @@ class BlasCuda { float beta, alpaka::ViewPlainPtr, TIdx> &bias, alpaka::ViewPlainPtr, TIdx> &C) { - const void *bptr = alpaka::getPtrNative(bias); - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_GELU_BIAS, bptr); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), - layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); - } - - // Raw-pointer overloads: accept T const*/T* from any BufXxx or ViewPlainPtr via getPtrNative() - template - inline void gemm(char transa, char transb, unsigned int m, unsigned int n, - unsigned int k, float alpha, T const *A, T const *B, - float beta, T *bias, T *C) { - const void *bptr = bias; - auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_BIAS, bptr); - executeMatmul(desc, alpha, A, B, beta, bias, C, - layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); - } - - template - inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, - unsigned int k, float alpha, T const *A, T const *B, - float beta, T *bias, T *C) { - const void *bptr = bias; - auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_RELU_BIAS, bptr); - executeMatmul(desc, alpha, A, B, beta, bias, C, + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } @@ -253,14 +286,12 @@ class BlasCuda { inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, float alpha, T const *A, T const *B, float beta, T *bias, T *C) { - const void *bptr = bias; - auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_GELU_BIAS, bptr); - executeMatmul(desc, alpha, A, B, beta, bias, C, + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, alpha, A, B, beta, bias, C, + static_cast(bias), layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } - // C = alpha * op(A) * op(B) + beta * C (no bias) template inline void matmul(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, float alpha, @@ -268,13 +299,12 @@ class BlasCuda { alpaka::BufCudaRt, TIdx> const &B, float beta, alpaka::BufCudaRt, TIdx> &C) { - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_DEFAULT); float *c = alpaka::getPtrNative(C); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, c, c, layoutKeyA(transa, m, k), - layoutKeyB(transb, k, n), {m, n}); + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + c, c, nullptr, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } template @@ -284,41 +314,56 @@ class BlasCuda { alpaka::ViewPlainPtr, TIdx> const &B, float beta, alpaka::ViewPlainPtr, TIdx> &C) { - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_DEFAULT); T *c = alpaka::getPtrNative(C); - executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), - beta, c, c, layoutKeyA(transa, m, k), - layoutKeyB(transb, k, n), {m, n}); + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + c, c, nullptr, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); } - // matmul on raw pointers directly (no layout caching, caller must ensure - // correct leading dims and layout) + // Raw-pointer overload template inline void matmul(char transa, char transb, unsigned int m, unsigned int n, - unsigned int k, float alpha, T const * A, T const * B, - float beta, T * C) { - auto desc = - makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), - CUBLASLT_EPILOGUE_DEFAULT); - executeMatmul(desc, alpha, A, B, beta, C, C, layoutKeyA(transa, m, k), - layoutKeyB(transb, k, n), {m, n}); + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT, alpha, A, B, beta, C, C, nullptr, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + + inline void gemmStridedBatched( + char transa, char transb, + int m, int n, int k, float alpha, + const float *A, int lda, long long strideA, + const float *B, int ldb, long long strideB, + float beta, + float *C, int ldc, long long strideC, + int batchCount) + { + CHECK_CUBLAS(cublasSgemmStridedBatched( + handle, + charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + m, n, k, + &alpha, + A, lda, strideA, + B, ldb, strideB, + &beta, + C, ldc, strideC, + batchCount)); + // No cudaStreamSynchronize — operations remain asynchronous on the stream } private: alpaka::QueueCudaRtNonBlocking m_queue; - // Returns the layout map key for matrix A based on its transpose flag. - // Physical dimensions: NoTrans → (m×k), Trans → (k×m). + static std::pair layoutKeyA(char trans, std::size_t m, std::size_t k) { return (trans == 'N' || trans == 'n') ? std::make_pair(m, k) : std::make_pair(k, m); } - // Returns the layout map key for matrix B based on its transpose flag. - // Physical dimensions: NoTrans → (k×n), Trans → (n×k). static std::pair layoutKeyB(char trans, std::size_t k, std::size_t n) { return (trans == 'N' || trans == 'n') ? std::make_pair(k, n) @@ -335,12 +380,14 @@ class BlasCuda { } } - // Creates a per-call matmul descriptor with transpose ops, epilogue, and - // optional bias pointer. Caller owns the returned descriptor. - cublasLtMatmulDesc_t makeDesc(cublasOperation_t transA, - cublasOperation_t transB, - cublasLtEpilogue_t epilogue, - const void *bias_ptr = nullptr) { + cublasLtMatmulDesc_t &getOrCreateDesc(cublasOperation_t transA, + cublasOperation_t transB, + cublasLtEpilogue_t epilogue) { + DescKey key{(int)transA, (int)transB, (int)epilogue}; + auto it = descStore.find(key); + if (it != descStore.end()) + return it->second; + cublasLtMatmulDesc_t desc = nullptr; CHECK_CUBLAS( cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); @@ -350,40 +397,75 @@ class BlasCuda { desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); - if (bias_ptr) { + // For bias epilogues: set a non-null dummy pointer so the descriptor is + // valid for cublasLtMatmulAlgoGetHeuristic (real pointer patched per call). + if (epilogue != CUBLASLT_EPILOGUE_DEFAULT) { + const void *dummy = d_workspace; CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, - sizeof(bias_ptr))); + desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &dummy, sizeof(dummy))); } - return desc; + descStore.emplace(key, desc); + return descStore.at(key); } - // Runs heuristic selection, executes cublasLtMatmul, syncs stream, and - // destroys desc. D_in is the matrix scaled by beta (may equal C_out for - // in-place accumulation). - void executeMatmul(cublasLtMatmulDesc_t desc, float alpha, const float *A, - const float *B, float beta, const float *D_in, - float *C_out, - const std::pair &kA, - const std::pair &kB, - const std::pair &kC) { + cublasLtMatmulHeuristicResult_t & + getOrComputeAlgo(cublasOperation_t transA, cublasOperation_t transB, + cublasLtEpilogue_t epilogue, + const std::pair &kA, + const std::pair &kB, + const std::pair &kC) { + AlgoKey key{{(int)transA, (int)transB, (int)epilogue}, + kA.first, kA.second, kB.first, kB.second}; + auto it = algoCache.find(key); + if (it != algoCache.end()) + return it->second; + + auto &desc = getOrCreateDesc(transA, transB, epilogue); cublasLtMatmulHeuristicResult_t h{}; int returnedResults = 0; CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, desc, layoutStore.at(kA), layoutStore.at(kB), - layoutStore.at(kC), layoutStore.at(kC), preference, 1, &h, - &returnedResults)); + ltHandle, desc, + layoutStore.at(kA), layoutStore.at(kB), + layoutStore.at(kC), layoutStore.at(kC), + preference, 1, &h, &returnedResults)); if (returnedResults == 0) { - cublasLtMatmulDescDestroy(desc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; + std::cerr << "[sofieBLAS] No suitable cuBLASLt algorithm found for " + << "transA=" << transA << " transB=" << transB + << " epilogue=" << epilogue + << " A=[" << kA.first << "x" << kA.second << "]" + << " B=[" << kB.first << "x" << kB.second << "]\n"; exit(EXIT_FAILURE); } - CHECK_CUBLAS(cublasLtMatmul(ltHandle, desc, &alpha, A, layoutStore.at(kA), - B, layoutStore.at(kB), &beta, D_in, - layoutStore.at(kC), C_out, layoutStore.at(kC), - &h.algo, d_workspace, workspaceSize, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(desc)); + algoCache.emplace(key, h); + return algoCache.at(key); + } + + void executeMatmul(cublasOperation_t transA, cublasOperation_t transB, + cublasLtEpilogue_t epilogue, + float alpha, const float *A, const float *B, + float beta, const float *D_in, float *C_out, + const void *bias_ptr, + const std::pair &kA, + const std::pair &kB, + const std::pair &kC) { + // Retrieve (or lazily compute) the cached algorithm for this shape + auto &h = getOrComputeAlgo(transA, transB, epilogue, kA, kB, kC); + + // Retrieve the cached descriptor and patch the real bias pointer in-place + auto &desc = getOrCreateDesc(transA, transB, epilogue); + if (bias_ptr) { + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias_ptr, sizeof(bias_ptr))); + } + + CHECK_CUBLAS(cublasLtMatmul( + ltHandle, desc, + &alpha, A, layoutStore.at(kA), + B, layoutStore.at(kB), + &beta, D_in, layoutStore.at(kC), + C_out, layoutStore.at(kC), + &h.algo, d_workspace, workspaceSize, stream)); } };