diff --git a/.gitignore b/.gitignore index 475c359..2f9ffe1 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,7 @@ *.dwo # build files -**/build/ \ No newline at end of file +**/build/ + +# vscode settings +.vscode/ \ No newline at end of file diff --git a/include/.vscode/settings.json b/include/.vscode/settings.json deleted file mode 100644 index 64f45f4..0000000 --- a/include/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "files.associations": { - "array": "cpp", - "string": "cpp", - "string_view": "cpp", - "span": "cpp" - } -} diff --git a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp index 263703c..6cfeb1f 100644 --- a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp +++ b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp @@ -15,6 +15,8 @@ #endif #include +#include +#include class BlasCpu { public: @@ -36,18 +38,186 @@ class BlasCpu { } } + // C = alpha * op(A) * op(B) + beta * C (no bias, leading dims inferred) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, beta, alpaka::getPtrNative(C), static_cast(m)); + } + + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, beta, alpaka::getPtrNative(C), static_cast(m)); + } + + // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per + // row) Matches the cuBLASLt EPILOGUE_BIAS semantics: the bias buffer serves + // as both the beta-scaled accumulator and provides the per-row bias vector + // (first m elements). + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + // Step 1: C = alpha * op(A) * op(B) (beta=0 so C is fully overwritten) + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, 0.0f, alpaka::getPtrNative(C), static_cast(m)); + // Step 2: C += beta * bias_matrix + bias_vec (per-row broadcast) + float *c = alpaka::getPtrNative(C); + const float *b = alpaka::getPtrNative(bias); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + c[j * m + i] += beta * b[j * m + i] + b[i]; + } + template inline void - gemm(char transa, char transb, const unsigned int m, const unsigned int n, - const unsigned int k, const float alpha, - alpaka::BufCpu, TIdx> const &A, const int lda, - alpaka::BufCpu, TIdx> const &B, const int ldb, - const float beta, alpaka::BufCpu, TIdx> &C, - const int ldc) { - CBLAS_TRANSPOSE TransA = charToTranspose(transa); - CBLAS_TRANSPOSE TransB = charToTranspose(transb); - cblas_sgemm(CblasColMajor, TransA, TransB, m, n, k, alpha, A.data(), lda, - B.data(), ldb, beta, C.data(), ldc); + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, 0.0f, alpaka::getPtrNative(C), static_cast(m)); + float *c = alpaka::getPtrNative(C); + const float *b = alpaka::getPtrNative(bias); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + c[j * m + i] += beta * b[j * m + i] + b[i]; + } + + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + for (unsigned int i = 0; i < m * n; ++i) + c[i] = c[i] > 0.0f ? c[i] : 0.0f; + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + for (unsigned int i = 0; i < m * n; ++i) + c[i] = c[i] > 0.0f ? c[i] : 0.0f; + } + + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) + // Uses the standard GELU: x * 0.5 * (1 + erf(x / sqrt(2))) + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); + } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); + } + + // Raw-pointer overloads: accept T const*/T* from any BufXxx or ViewPlainPtr via getPtrNative() + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, A, lda, B, ldb, 0.0f, C, static_cast(m)); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + C[j * m + i] += beta * bias[j * m + i] + bias[i]; + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + for (unsigned int i = 0; i < m * n; ++i) + C[i] = C[i] > 0.0f ? C[i] : 0.0f; + } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + C[i] *= 0.5f * (1.0f + std::erff(C[i] * kInvSqrt2)); } }; diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index cbf18e0..f0b9e57 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -15,7 +15,7 @@ #include #define CHECK_CUDA(err) \ - if (err != cudaSuccess) { \ + if ((err) != cudaSuccess) { \ std::cerr << "CUDA error: " << cudaGetErrorString(err) << " at line " \ << __LINE__ << "\n"; \ exit(EXIT_FAILURE); \ @@ -23,14 +23,14 @@ #define CHECK_CUBLAS(status) \ do { \ - cublasStatus_t s = (status); \ - if (s != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "cuBLAS error " << s << " at line " << __LINE__ \ - << std::endl; \ + cublasStatus_t _s = (status); \ + if (_s != CUBLAS_STATUS_SUCCESS) { \ + std::cerr << "cuBLAS error " << _s << " at line " << __LINE__ << "\n"; \ exit(EXIT_FAILURE); \ } \ } while (0) + struct PairHash { std::size_t operator()(const std::pair &p) const noexcept { @@ -47,35 +47,80 @@ struct PairEq { } }; +struct DescKey { + int transA; // CUBLAS_OP_N / CUBLAS_OP_T encoded as int + int transB; + int epilogue; // cublasLtEpilogue_t encoded as int + bool operator==(const DescKey &o) const noexcept { + return transA == o.transA && transB == o.transB && epilogue == o.epilogue; + } +}; + +struct DescKeyHash { + std::size_t operator()(const DescKey &k) const noexcept { + // Small values: simple polynomial hash + std::size_t h = static_cast(k.transA) * 97u + + static_cast(k.transB) * 31u + + static_cast(k.epilogue); + return h ^ (h >> 16); + } +}; + + +struct AlgoKey { + DescKey dk; + std::size_t rowsA, colsA; // physical dimensions of A in layoutStore + std::size_t rowsB, colsB; // physical dimensions of B in layoutStore + bool operator==(const AlgoKey &o) const noexcept { + return dk == o.dk + && rowsA == o.rowsA && colsA == o.colsA + && rowsB == o.rowsB && colsB == o.colsB; + } +}; + +struct AlgoKeyHash { + std::size_t operator()(const AlgoKey &k) const noexcept { + std::size_t h = DescKeyHash{}(k.dk); + auto mix = [&](std::size_t v) { + h ^= std::hash{}(v) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + }; + mix(k.rowsA); mix(k.colsA); + mix(k.rowsB); mix(k.colsB); + return h; + } +}; + class BlasCuda { - cublasLtHandle_t ltHandle = nullptr; - cublasHandle_t handle = nullptr; - cublasLtMatmulDesc_t operationDesc = nullptr; + cublasLtHandle_t ltHandle = nullptr; + cublasHandle_t handle = nullptr; // legacy cuBLAS for batched ops cublasLtMatmulPreference_t preference = nullptr; - void *d_workspace = nullptr; - size_t workspaceSize = 1 << 22; // 4MB + void *d_workspace = nullptr; + size_t workspaceSize = 1u << 25; // 32 MB (was 4 MB) cudaStream_t stream = nullptr; - cublasLtMatmulHeuristicResult_t heuristic; - cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT; - int error_flag = 0; std::unordered_map, cublasLtMatrixLayout_t, PairHash, PairEq> - LayoutStore; + layoutStore; + + std::unordered_map descStore; + + std::unordered_map + algoCache; public: - BlasCuda(const BlasCuda&) = delete; - BlasCuda& operator=(const BlasCuda&) = delete; - BlasCuda(BlasCuda&&) = delete; - BlasCuda& operator=(BlasCuda&&) = delete; + BlasCuda(const BlasCuda &) = delete; + BlasCuda &operator=(const BlasCuda &) = delete; + BlasCuda(BlasCuda &&) = delete; + BlasCuda &operator=(BlasCuda &&) = delete; BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); + CHECK_CUBLAS(cublasLtCreate(<Handle)); + CHECK_CUBLAS(cublasCreate(&handle)); - heuristic = {}; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, - CUDA_R_32F)); + CHECK_CUBLAS(cublasSetStream(handle, stream)); + CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); CHECK_CUDA(cudaMalloc(&d_workspace, workspaceSize)); CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( @@ -84,265 +129,343 @@ class BlasCuda { } ~BlasCuda() { - for (auto& [key, layout] : LayoutStore) { - if (layout) { - cublasLtMatrixLayoutDestroy(layout); - } - } - LayoutStore.clear(); - - if (preference) - cublasLtMatmulPreferenceDestroy(preference); - if (operationDesc) - cublasLtMatmulDescDestroy(operationDesc); - if (ltHandle) - cublasLtDestroy(ltHandle); - if (d_workspace) - cudaFree(d_workspace); - + for (auto &[key, layout] : layoutStore) + if (layout) cublasLtMatrixLayoutDestroy(layout); + for (auto &[key, desc] : descStore) + if (desc) cublasLtMatmulDescDestroy(desc); + if (preference) cublasLtMatmulPreferenceDestroy(preference); + if (ltHandle) cublasLtDestroy(ltHandle); + if (handle) cublasDestroy(handle); + if (d_workspace) cudaFree(d_workspace); } inline cublasOperation_t charToCuBlasTranspose(char trans) { switch (trans) { - case 'N': - case 'n': - return CUBLAS_OP_N; - case 'T': - case 't': - return CUBLAS_OP_T; - case 'C': - case 'c': - return CUBLAS_OP_C; + case 'N': case 'n': return CUBLAS_OP_N; + case 'T': case 't': return CUBLAS_OP_T; + case 'C': case 'c': return CUBLAS_OP_C; default: throw std::invalid_argument("Invalid transpose character for cuBLAS."); } } - void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k) { - CheckAndAddLayout(k, m); - CheckAndAddLayout(k, n); - CheckAndAddLayout(m, n); + void addLayoutConfig(std::size_t m, std::size_t n, std::size_t k, + std::size_t lda, std::size_t ldb, std::size_t ldc, + char transa, char transb) { + // Physical A: (m×k) if NoTrans, (k×m) if Trans + if (transa == 'N' || transa == 'n') + checkAndAddLayout(m, k, lda); + else + checkAndAddLayout(k, m, lda); + // Physical B: (k×n) if NoTrans, (n×k) if Trans + if (transb == 'N' || transb == 'n') + checkAndAddLayout(k, n, ldb); + else + checkAndAddLayout(n, k, ldb); + // C is always (m×n) + checkAndAddLayout(m, n, ldc); } -template -inline void -gemm(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &bias, - alpaka::BufCudaRt, TIdx> &C) -{ - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, sizeof(bias_ptr))); - - cublasLtEpilogue_t ep = CUBLASLT_EPILOGUE_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &ep, - sizeof(ep))); - - - cublasLtMatmulHeuristicResult_t localHeuristic{}; - int returnedResults = 0; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &returnedResults)); - - if (returnedResults == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } - - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), - &beta, - alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); - - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); -} - -template -inline void -gemmrelu(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &bias, - alpaka::BufCudaRt, TIdx> &C) -{ - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, float beta, + alpaka::BufCudaRt, TIdx> &bias, + alpaka::BufCudaRt, TIdx> &C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, sizeof(bias_ptr))); + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtEpilogue_t ep = CUBLASLT_EPILOGUE_RELU_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &ep, sizeof(ep))); + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, alpha, A, B, beta, bias, C, + static_cast(bias), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtMatmulHeuristicResult_t localHeuristic{}; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &error_flag)); - - if (error_flag == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + float beta, + alpaka::BufCudaRt, TIdx> &bias, + alpaka::BufCudaRt, TIdx> &C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), - &beta, - alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); -} + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, alpha, A, B, beta, bias, C, + static_cast(bias), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } template - inline void gemmgelu(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, alpaka::BufCudaRt, TIdx> const &A, alpaka::BufCudaRt, TIdx> const &B, - const float beta, + float beta, alpaka::BufCudaRt, TIdx> &bias, alpaka::BufCudaRt, TIdx> &C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, - sizeof(bias_ptr))); + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + static_cast(alpaka::getPtrNative(bias)), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasOperation_t transB = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, alpha, A, B, beta, bias, C, + static_cast(bias), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasOperation_t transA = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); - SetGeluActivation(); + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + float beta, + alpaka::BufCudaRt, TIdx> &C) { + float *c = alpaka::getPtrNative(C); + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + c, c, nullptr, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtMatmulHeuristicResult_t localHeuristic{}; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, 1, &localHeuristic, &error_flag)); - if (error_flag == 0) { - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &C) { + T *c = alpaka::getPtrNative(C); + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT, alpha, + alpaka::getPtrNative(A), alpaka::getPtrNative(B), beta, + c, c, nullptr, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, localDesc, &alpha, alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), &beta, alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), &(localHeuristic.algo), d_workspace, - workspaceSize, stream)); + // Raw-pointer overload + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *C) { + executeMatmul(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT, alpha, A, B, beta, C, C, nullptr, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + + inline void gemmStridedBatched( + char transa, char transb, + int m, int n, int k, float alpha, + const float *A, int lda, long long strideA, + const float *B, int ldb, long long strideB, + float beta, + float *C, int ldc, long long strideC, + int batchCount) + { + CHECK_CUBLAS(cublasSgemmStridedBatched( + handle, + charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + m, n, k, + &alpha, + A, lda, strideA, + B, ldb, strideB, + &beta, + C, ldc, strideC, + batchCount)); + // No cudaStreamSynchronize — operations remain asynchronous on the stream } private: alpaka::QueueCudaRtNonBlocking m_queue; - void CheckAndAddLayout(size_t rows, size_t cols) { + + static std::pair + layoutKeyA(char trans, std::size_t m, std::size_t k) { + return (trans == 'N' || trans == 'n') ? std::make_pair(m, k) + : std::make_pair(k, m); + } + + static std::pair + layoutKeyB(char trans, std::size_t k, std::size_t n) { + return (trans == 'N' || trans == 'n') ? std::make_pair(k, n) + : std::make_pair(n, k); + } + + void checkAndAddLayout(std::size_t rows, std::size_t cols, std::size_t ld) { auto key = std::make_pair(rows, cols); - if (LayoutStore.find(key) == LayoutStore.end()) { - cublasLtMatrixLayout_t temp = nullptr; - size_t ld = rows; + if (layoutStore.find(key) == layoutStore.end()) { + cublasLtMatrixLayout_t layout = nullptr; CHECK_CUBLAS( - cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); - LayoutStore.emplace(key, temp); + cublasLtMatrixLayoutCreate(&layout, CUDA_R_32F, rows, cols, ld)); + layoutStore.emplace(key, layout); } } - void ResetActivation() { - epilogue = CUBLASLT_EPILOGUE_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + cublasLtMatmulDesc_t &getOrCreateDesc(cublasOperation_t transA, + cublasOperation_t transB, + cublasLtEpilogue_t epilogue) { + DescKey key{(int)transA, (int)transB, (int)epilogue}; + auto it = descStore.find(key); + if (it != descStore.end()) + return it->second; + + cublasLtMatmulDesc_t desc = nullptr; + CHECK_CUBLAS( + cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); + // For bias epilogues: set a non-null dummy pointer so the descriptor is + // valid for cublasLtMatmulAlgoGetHeuristic (real pointer patched per call). + if (epilogue != CUBLASLT_EPILOGUE_DEFAULT) { + const void *dummy = d_workspace; + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &dummy, sizeof(dummy))); + } + descStore.emplace(key, desc); + return descStore.at(key); } - void SetReluActivation() { - epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + cublasLtMatmulHeuristicResult_t & + getOrComputeAlgo(cublasOperation_t transA, cublasOperation_t transB, + cublasLtEpilogue_t epilogue, + const std::pair &kA, + const std::pair &kB, + const std::pair &kC) { + AlgoKey key{{(int)transA, (int)transB, (int)epilogue}, + kA.first, kA.second, kB.first, kB.second}; + auto it = algoCache.find(key); + if (it != algoCache.end()) + return it->second; + + auto &desc = getOrCreateDesc(transA, transB, epilogue); + cublasLtMatmulHeuristicResult_t h{}; + int returnedResults = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + ltHandle, desc, + layoutStore.at(kA), layoutStore.at(kB), + layoutStore.at(kC), layoutStore.at(kC), + preference, 1, &h, &returnedResults)); + if (returnedResults == 0) { + std::cerr << "[sofieBLAS] No suitable cuBLASLt algorithm found for " + << "transA=" << transA << " transB=" << transB + << " epilogue=" << epilogue + << " A=[" << kA.first << "x" << kA.second << "]" + << " B=[" << kB.first << "x" << kB.second << "]\n"; + exit(EXIT_FAILURE); + } + algoCache.emplace(key, h); + return algoCache.at(key); } - void SetGeluActivation() { - epilogue = CUBLASLT_EPILOGUE_GELU; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + void executeMatmul(cublasOperation_t transA, cublasOperation_t transB, + cublasLtEpilogue_t epilogue, + float alpha, const float *A, const float *B, + float beta, const float *D_in, float *C_out, + const void *bias_ptr, + const std::pair &kA, + const std::pair &kB, + const std::pair &kC) { + // Retrieve (or lazily compute) the cached algorithm for this shape + auto &h = getOrComputeAlgo(transA, transB, epilogue, kA, kB, kC); + + // Retrieve the cached descriptor and patch the real bias pointer in-place + auto &desc = getOrCreateDesc(transA, transB, epilogue); + if (bias_ptr) { + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias_ptr, sizeof(bias_ptr))); + } + + CHECK_CUBLAS(cublasLtMatmul( + ltHandle, desc, + &alpha, A, layoutStore.at(kA), + B, layoutStore.at(kB), + &beta, D_in, layoutStore.at(kC), + C_out, layoutStore.at(kC), + &h.algo, d_workspace, workspaceSize, stream)); } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2ebded2..0a4aa01 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,10 +19,15 @@ endif() # --- Compiler flags --- set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF) set(CXX_HOST_FLAGS -fPIC -pthread) -set(CUDA_ARCH "sm_86") -set(CXX_CUDA_FLAGS -arch=${CUDA_ARCH} -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr) +set(CXX_CUDA_FLAGS -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr) set(XCOMPILER_FLAGS -Xcompiler=-fPIC,-pthread) +# --- CUDA architecture: must be set before enable_language(CUDA) so all +# targets inherit a valid default (CMake 3.18+ requires CUDA_ARCHITECTURES +# to be non-empty on every target once CUDA is enabled globally). +set(CMAKE_CUDA_ARCHITECTURES 86) +enable_language(CUDA) + # --- Include directories --- include_directories(${ALPAKA_BASE}/include "../include") @@ -86,12 +91,9 @@ target_include_directories(test_cpu PRIVATE "../sofieBLAS/include" ${ALPAKA_BASE target_link_libraries(test_cpu PRIVATE ${BLAS_LIBS}) -set(TEST_SRC test.cc) - add_executable(test_cuda) target_sources(test_cuda PRIVATE test.cc) set_source_files_properties(test.cc PROPERTIES LANGUAGE CUDA) -enable_language(CUDA) set_target_properties(test_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_features(test_cuda PUBLIC cxx_std_20) @@ -104,7 +106,7 @@ target_compile_options(test_cuda PRIVATE target_compile_definitions(test_cuda PRIVATE ALPAKA_ACC_GPU_CUDA_ENABLED) target_include_directories(test_cuda PRIVATE ${ALPAKA_BASE}/include "../sofieBLAS/include" ${CUDA_BASE}/include) target_link_directories(test_cuda PRIVATE ${CUDA_BASE}/lib64) -target_link_libraries(test_cuda PRIVATE cublas cudart) +target_link_libraries(test_cuda PRIVATE cublasLt cublas cudart) # --- clean target equivalent --- add_custom_target(clean-all diff --git a/tests/test.cc b/tests/test.cc index 5812bc9..e106275 100644 --- a/tests/test.cc +++ b/tests/test.cc @@ -1,121 +1,591 @@ #include "sofieBLAS/sofieBLAS.hpp" #include +#include +#include #include #include -#include +#include +#include -// index and size type using Idx = uint32_t; - -// dimensions -using Dim0D = alpaka::DimInt<0u>; using Dim1D = alpaka::DimInt<1u>; -using Dim2D = alpaka::DimInt<2u>; -using Dim3D = alpaka::DimInt<3u>; - -// Print a column-major matrix -template -void print(alpaka::BufCpu const &M, TIdx size) { - assert(alpaka::getExtentProduct(M) == size * size); - - for (TIdx row = 0; row < size; ++row) { - for (TIdx col = 0; col < size; ++col) { - std::cout << std::fixed << std::setprecision(2) << std::setw(7) - << M[col * size + row] << " "; + +// --------------------------------------------------------------------------- +// Reference implementations (column-major, float) +// --------------------------------------------------------------------------- + +// Access element (row, col) of a column-major matrix with leading dim `ld`. +static inline float cm(const float *M, int row, int col, int ld) { + return M[col * ld + row]; +} + +// C = alpha * op(A) * op(B) + beta * C (in-place, column-major) +static void refMatmul(float *C, const float *A, const float *B, int m, int n, + int k, float alpha, float beta, bool transA, + bool transB) { + int lda = transA ? k : m; + int ldb = transB ? n : k; + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + float sum = 0.f; + for (int p = 0; p < k; ++p) { + float a = transA ? cm(A, p, i, lda) : cm(A, i, p, lda); + float b = transB ? cm(B, j, p, ldb) : cm(B, p, j, ldb); + sum += a * b; + } + C[j * m + i] = alpha * sum + beta * C[j * m + i]; } - std::cout << "\n"; } } -int main() { - constexpr Idx size = 4; +// C = alpha * op(A) * op(B) + beta * bias_matrix + bias_vec (per-row broadcast) +static void refGemm(float *C, const float *A, const float *B, const float *bias, + int m, int n, int k, float alpha, float beta, bool transA, + bool transB) { + int lda = transA ? k : m; + int ldb = transB ? n : k; + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + float sum = 0.f; + for (int p = 0; p < k; ++p) { + float a = transA ? cm(A, p, i, lda) : cm(A, i, p, lda); + float b = transB ? cm(B, j, p, ldb) : cm(B, p, j, ldb); + sum += a * b; + } + C[j * m + i] = alpha * sum + beta * bias[j * m + i] + bias[i]; + } + } +} - // Host platform and device - alpaka::PlatformCpu host_platform{}; - auto host = alpaka::getDevByIdx(host_platform, 0u); +static void refGemmRelu(float *C, const float *A, const float *B, + const float *bias, int m, int n, int k, float alpha, + float beta, bool transA, bool transB) { + refGemm(C, A, B, bias, m, n, k, alpha, beta, transA, transB); + for (int i = 0; i < m * n; ++i) + C[i] = C[i] > 0.f ? C[i] : 0.f; +} - // Allocate matrices (column-major) - auto A = alpaka::allocBuf(host, size * size); - auto B = alpaka::allocBuf(host, size * size); - auto C = alpaka::allocBuf(host, size * size); +static void refGemmGelu(float *C, const float *A, const float *B, + const float *bias, int m, int n, int k, float alpha, + float beta, bool transA, bool transB) { + refGemm(C, A, B, bias, m, n, k, alpha, beta, transA, transB); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (int i = 0; i < m * n; ++i) + C[i] *= 0.5f * (1.f + std::erff(C[i] * kInvSqrt2)); +} - // Fill A and B with random floats centered around 0 - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution dist(0.0f, 10.0f); +static int gFailures = 0; - for (int i = 0; i < size * size; ++i) { - A[i] = dist(gen); - B[i] = dist(gen); +static void checkClose(const float *got, const float *expected, int n, + const std::string &name, float rtol = 1e-4f, + float atol = 1e-4f) { + bool pass = true; + for (int i = 0; i < n; ++i) { + float diff = std::abs(got[i] - expected[i]); + float thr = atol + rtol * std::abs(expected[i]); + if (diff > thr) { + std::cerr << " FAIL [" << name << "] idx=" << i << " got=" << got[i] + << " expected=" << expected[i] << " diff=" << diff << "\n"; + pass = false; + } } - std::cout << "Matrix A:\n"; - print(A, size); - std::cout << '\n'; - std::cout << "Matrix B:\n"; - print(B, size); - std::cout << '\n'; + if (pass) + std::cout << " PASS " << name << "\n"; + else + ++gFailures; +} -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +// --------------------------------------------------------------------------- +// Helpers to fill test matrices +// --------------------------------------------------------------------------- + +static void fillSeq(float *M, int n, float start = 1.f, float step = 1.f) { + for (int i = 0; i < n; ++i) + M[i] = start + static_cast(i) * step; +} + +static void fillVal(float *M, int n, float v) { + for (int i = 0; i < n; ++i) + M[i] = v; +} + +// --------------------------------------------------------------------------- +// CPU tests +// --------------------------------------------------------------------------- + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +static void runCpuTests() { + std::cout << "\n=== CPU Tests ===\n"; + + alpaka::PlatformCpu platform{}; + auto dev = alpaka::getDevByIdx(platform, 0u); + alpaka::Queue queue{dev}; + sofieBLAS blas(queue); + + constexpr int M = 4, N = 3, K = 5; + + // Allocate host buffers + auto hA = alpaka::allocBuf(dev, static_cast(M * K)); + auto hB = alpaka::allocBuf(dev, static_cast(K * N)); + auto hC = alpaka::allocBuf(dev, static_cast(M * N)); + auto hBias = alpaka::allocBuf(dev, static_cast(M * N)); + + float *A = alpaka::getPtrNative(hA); + float *B = alpaka::getPtrNative(hB); + float *C = alpaka::getPtrNative(hC); + float *bias = alpaka::getPtrNative(hBias); + + fillSeq(A, M * K); + fillSeq(B, K * N, 1.f, 0.5f); + fillSeq(bias, M * N, 0.1f, 0.1f); + + std::vector ref(M * N); + + // --- matmul NN --- + fillVal(C, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hC); + std::copy(C, C + M * N, ref.data()); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::matmul NN"); + + // --- matmul TN (A^T: K×M physical → M×K logical) --- + // For TN we need A to be K×M so we create a separate buffer + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + blas.matmul('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, B, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::matmul TN"); + } + + // --- matmul NT --- + { + auto hBt = alpaka::allocBuf(dev, static_cast(N * K)); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(Bt, N * K, 1.f, 0.5f); + fillVal(C, M * N, 0.f); + blas.matmul('N', 'T', M, N, K, 1.f, hA, hBt, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, Bt, M, N, K, 1.f, 0.f, false, true); + checkClose(C, ref.data(), M * N, "cpu::matmul NT"); + } + + // --- matmul TT --- { - alpaka::PlatformCudaRt platform; - alpaka::DevCudaRt device = alpaka::getDevByIdx(platform, 0u); - alpaka::Queue queue{device}; + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + auto hBt = alpaka::allocBuf(dev, static_cast(N * K)); + float *At = alpaka::getPtrNative(hAt); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(At, K * M); + fillSeq(Bt, N * K, 1.f, 0.5f); + fillVal(C, M * N, 0.f); + blas.matmul('T', 'T', M, N, K, 1.f, hAt, hBt, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, Bt, M, N, K, 1.f, 0.f, true, true); + checkClose(C, ref.data(), M * N, "cpu::matmul TT"); + } - const Idx m = size; // rows of A and C - const Idx n = size; // columns of B and C - const Idx k = size; // columns of A and rows of B + // --- matmul: alpha scaling --- + fillVal(C, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 2.5f, hA, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 2.5f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::matmul alpha=2.5"); - const float alpha = 1.0f; - const float beta = 0.0f; + // --- matmul: beta accumulation --- + fillSeq(C, M * N, 10.f); // pre-fill C + blas.matmul('N', 'N', M, N, K, 1.f, hA, hB, 0.5f, hC); + { + std::vector C0(M * N); + fillSeq(C0.data(), M * N, 10.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.5f, false, false); + // ref already has the pre-filled values baked in via refMatmul beta path + // but refMatmul reads C in-place, so we need to re-run with correct init + std::copy(C0.begin(), C0.end(), ref.data()); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.5f, false, false); + } + checkClose(C, ref.data(), M * N, "cpu::matmul beta=0.5"); - const Idx lda = size; // leading dimension of A - const Idx ldb = size; // leading dimension of B - const Idx ldc = size; + // --- gemm NN (beta=0, no prior accumulation) --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, 0.1f, 0.1f); + blas.gemm('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemm NN beta=0"); - auto A_d = alpaka::allocAsyncBuf(queue, size * size); - auto B_d = alpaka::allocAsyncBuf(queue, size * size); - auto C_d = alpaka::allocAsyncBuf(queue, size * size); - alpaka::memcpy(queue, A_d, A); - alpaka::memcpy(queue, B_d, B); + // --- gemm NN (beta=1 accumulation) --- + fillVal(C, M * N, 0.f); + blas.gemm('N', 'N', M, N, K, 1.f, hA, hB, 1.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 1.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemm NN beta=1"); - sofieBLAS blas(queue); - blas.gemm('n', 'n', m, n, k, alpha, A_d, lda, B_d, ldb, beta, C_d, ldc); - alpaka::memcpy(queue, C, C_d); + // --- gemm TN --- + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + blas.gemm('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::gemm TN"); + } + // --- gemmrelu: all-positive matmul result stays unchanged --- + { + // A and B with positive values ensure result is positive before bias + auto hAp = alpaka::allocBuf(dev, static_cast(M * K)); + auto hBp = alpaka::allocBuf(dev, static_cast(K * N)); + auto hBiasp = alpaka::allocBuf(dev, static_cast(M * N)); + float *Ap = alpaka::getPtrNative(hAp); + float *Bp = alpaka::getPtrNative(hBp); + float *biasp = alpaka::getPtrNative(hBiasp); + fillSeq(Ap, M * K, 0.1f, 0.1f); + fillSeq(Bp, K * N, 0.1f, 0.1f); + fillVal(biasp, M * N, 0.f); + fillVal(C, M * N, 0.f); + blas.gemmrelu('N', 'N', M, N, K, 1.f, hAp, hBp, 0.f, hBiasp, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), Ap, Bp, biasp, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu all-positive"); + } + + // --- gemmrelu: negative values clamped to zero --- + { + // Use alpha=-1 to force negative results + auto hBiasz = alpaka::allocBuf(dev, static_cast(M * N)); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + fillVal(C, M * N, 0.f); + blas.gemmrelu('N', 'N', M, N, K, -1.f, hA, hB, 0.f, hBiasz, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, alpaka::getPtrNative(hBiasz), M, N, K, -1.f, + 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu alpha=-1 (clamped)"); + } + + // --- gemmrelu with bias --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, -5.f, 2.f); // mixed sign bias + blas.gemmrelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu with mixed bias"); + + // --- gemmgelu NN --- + fillVal(C, M * N, 0.f); + fillVal(bias, M * N, 0.f); + blas.gemmgelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu NN"); + + // --- gemmgelu with bias --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, -2.f, 0.5f); + blas.gemmgelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu with bias"); + + // --- gemmgelu TN --- + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + fillVal(bias, M * N, 0.f); + blas.gemmgelu('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu TN"); + } + + // --- edge: zero matrix --- + { + auto hZ = alpaka::allocBuf(dev, static_cast(M * K)); + fillVal(alpaka::getPtrNative(hZ), M * K, 0.f); + fillVal(C, M * N, 99.f); + fillVal(bias, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, hZ, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + checkClose(C, ref.data(), M * N, "cpu::matmul zero-A"); + } + + // --- edge: identity-like (square, known result) --- + { + constexpr int S = 3; + auto hI = alpaka::allocBuf(dev, static_cast(S * S)); + auto hX = alpaka::allocBuf(dev, static_cast(S * S)); + auto hY = alpaka::allocBuf(dev, static_cast(S * S)); + float *I = alpaka::getPtrNative(hI); + float *X = alpaka::getPtrNative(hX); + float *Y = alpaka::getPtrNative(hY); + fillVal(I, S * S, 0.f); + for (int i = 0; i < S; ++i) + I[i * S + i] = 1.f; + fillSeq(X, S * S); + fillVal(Y, S * S, 0.f); + blas.matmul('N', 'N', S, S, S, 1.f, hI, hX, 0.f, hY); + checkClose(Y, X, S * S, "cpu::matmul identity×X=X"); + } +} + +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +// --------------------------------------------------------------------------- +// CUDA tests +// --------------------------------------------------------------------------- + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + +// Helper: infer packed column-major leading dimensions +static int ldaFor(char trans, int m, int k) { + return (trans == 'N' || trans == 'n') ? m : k; +} +static int ldbFor(char trans, int k, int n) { + return (trans == 'N' || trans == 'n') ? k : n; +} + +static void runCudaTests() { + std::cout << "\n=== CUDA Tests ===\n"; + + alpaka::PlatformCudaRt platform{}; + auto dev = alpaka::getDevByIdx(platform, 0u); + alpaka::Queue queue{dev}; + sofieBLAS blas(queue); + + alpaka::PlatformCpu hostPlatform{}; + auto hostDev = alpaka::getDevByIdx(hostPlatform, 0u); + + constexpr int M = 4, N = 3, K = 5; + + auto hA = alpaka::allocBuf(hostDev, static_cast(M * K)); + auto hB = alpaka::allocBuf(hostDev, static_cast(K * N)); + auto hC = alpaka::allocBuf(hostDev, static_cast(M * N)); + auto hBias = alpaka::allocBuf(hostDev, static_cast(M * N)); + + float *A = alpaka::getPtrNative(hA); + float *B = alpaka::getPtrNative(hB); + float *bias = alpaka::getPtrNative(hBias); + + fillSeq(A, M * K); + fillSeq(B, K * N, 1.f, 0.5f); + fillVal(bias, M * N, 0.f); + + auto dA = alpaka::allocAsyncBuf(queue, static_cast(M * K)); + auto dB = alpaka::allocAsyncBuf(queue, static_cast(K * N)); + auto dC = alpaka::allocAsyncBuf(queue, static_cast(M * N)); + auto dBias = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + + alpaka::memcpy(queue, dA, hA); + alpaka::memcpy(queue, dB, hB); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + + std::vector ref(M * N); + float *C = alpaka::getPtrNative(hC); + + auto verify = [&](const std::string &name) { + alpaka::memcpy(queue, hC, dC); + alpaka::wait(queue); + checkClose(C, ref.data(), M * N, name); + }; + + // ---- matmul NN ---- + blas.addLayoutConfig(M, N, K, ldaFor('N', M, K), ldbFor('N', K, N), M, 'N', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.f, false, false); + blas.matmul('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dC); + verify("cuda::matmul NN"); + + // ---- matmul TN ---- + { + auto hAt = alpaka::allocBuf(hostDev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + auto dAt = + alpaka::allocAsyncBuf(queue, static_cast(K * M)); + alpaka::memcpy(queue, dAt, hAt); alpaka::wait(queue); - std::cout << "CUDA Matrix C = A × B:\n"; - print(C, size); - std::cout << '\n'; + blas.addLayoutConfig(M, N, K, ldaFor('T', M, K), ldbFor('N', K, N), M, 'T', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, B, M, N, K, 1.f, 0.f, true, false); + blas.matmul('T', 'N', M, N, K, 1.f, dAt, dB, 0.f, dC); + verify("cuda::matmul TN"); } -#endif -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + // ---- matmul NT ---- { - alpaka::PlatformCpu platform; - alpaka::DevCpu device = alpaka::getDevByIdx(platform, 0u); - alpaka::Queue queue{device}; + auto hBt = alpaka::allocBuf(hostDev, static_cast(N * K)); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(Bt, N * K, 1.f, 0.5f); + auto dBt = + alpaka::allocAsyncBuf(queue, static_cast(N * K)); + alpaka::memcpy(queue, dBt, hBt); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, ldaFor('N', M, K), ldbFor('T', K, N), M, 'N', + 'T'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, Bt, M, N, K, 1.f, 0.f, false, true); + blas.matmul('N', 'T', M, N, K, 1.f, dA, dBt, 0.f, dC); + verify("cuda::matmul NT"); + } + + // ---- matmul alpha=2.5 ---- + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 2.5f, 0.f, false, false); + blas.matmul('N', 'N', M, N, K, 2.5f, dA, dB, 0.f, dC); + verify("cuda::matmul alpha=2.5"); - const Idx m = size; // rows of A and C - const Idx n = size; // columns of B and C - const Idx k = size; // columns of A and rows of B + // ---- gemm NN beta=0 ---- + fillSeq(bias, M * N, 0.1f, 0.1f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemm('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemm NN beta=0"); - const float alpha = 1.0f; - const float beta = 0.0f; + // ---- gemm NN beta=1 ---- + // D_in = bias, so result = A*B + 1*bias_matrix + bias_vec + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 1.f, false, false); + blas.gemm('N', 'N', M, N, K, 1.f, dA, dB, 1.f, dBias, dC); + verify("cuda::gemm NN beta=1"); - const Idx lda = size; // leading dimension of A - const Idx ldb = size; // leading dimension of B - const Idx ldc = size; // leading dimension of C + // ---- gemm TN ---- + { + auto hAt = alpaka::allocBuf(hostDev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + auto dAt = + alpaka::allocAsyncBuf(queue, static_cast(K * M)); + alpaka::memcpy(queue, dAt, hAt); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, ldaFor('T', M, K), ldbFor('N', K, N), M, 'T', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + blas.gemm('T', 'N', M, N, K, 1.f, dAt, dB, 0.f, dBias, dC); + verify("cuda::gemm TN"); + } - sofieBLAS blas(queue); - blas.gemm('n', 'n', m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + // ---- gemmrelu: all-positive (relu is identity) ---- + { + auto hAp = alpaka::allocBuf(hostDev, static_cast(M * K)); + auto hBp = alpaka::allocBuf(hostDev, static_cast(K * N)); + auto hBiasz = + alpaka::allocBuf(hostDev, static_cast(M * N)); + float *Ap = alpaka::getPtrNative(hAp); + float *Bp = alpaka::getPtrNative(hBp); + fillSeq(Ap, M * K, 0.1f, 0.1f); + fillSeq(Bp, K * N, 0.1f, 0.1f); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + auto dAp = + alpaka::allocAsyncBuf(queue, static_cast(M * K)); + auto dBp = + alpaka::allocAsyncBuf(queue, static_cast(K * N)); + auto dBiasz = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + alpaka::memcpy(queue, dAp, hAp); + alpaka::memcpy(queue, dBp, hBp); + alpaka::memcpy(queue, dBiasz, hBiasz); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, M, K, M, 'N', 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), Ap, Bp, alpaka::getPtrNative(hBiasz), M, N, K, 1.f, + 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, 1.f, dAp, dBp, 0.f, dBiasz, dC); + verify("cuda::gemmrelu all-positive"); + } + // ---- gemmrelu: alpha=-1 forces negatives -> clamped to zero ---- + { + auto hBiasz = + alpaka::allocBuf(hostDev, static_cast(M * N)); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + auto dBiasz = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + alpaka::memcpy(queue, dBiasz, hBiasz); alpaka::wait(queue); - std::cout << "CPU Matrix C = A × B:\n"; - print(C, size); - std::cout << '\n'; + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, alpaka::getPtrNative(hBiasz), M, N, K, -1.f, + 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, -1.f, dA, dB, 0.f, dBiasz, dC); + verify("cuda::gemmrelu alpha=-1 (clamped)"); } + + // ---- gemmrelu with mixed bias ---- + fillSeq(bias, M * N, -5.f, 2.f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmrelu with mixed bias"); + + // ---- gemmgelu NN ---- + fillVal(bias, M * N, 0.f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmgelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmgelu NN"); + + // ---- gemmgelu with bias ---- + fillSeq(bias, M * N, -2.f, 0.5f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmgelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmgelu with bias"); + + // ---- edge: zero A ---- + { + auto hZero = alpaka::allocBuf(hostDev, static_cast(M * K)); + fillVal(alpaka::getPtrNative(hZero), M * K, 0.f); + auto dZero = + alpaka::allocAsyncBuf(queue, static_cast(M * K)); + alpaka::memcpy(queue, dZero, hZero); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, dZero, dB, 0.f, dC); + verify("cuda::matmul zero-A"); + } +} + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +int main() { +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + runCpuTests(); #endif +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + runCudaTests(); +#endif + + std::cout << "\n"; + if (gFailures == 0) + std::cout << "All tests passed.\n"; + else + std::cout << gFailures << " test(s) FAILED.\n"; - return 0; + return gFailures > 0 ? EXIT_FAILURE : EXIT_SUCCESS; }