From 0e6e99c5dd7c77e8aa4ba479106a10fdae6ceba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Max=20L=C3=BCbke?= Date: Wed, 4 Oct 2023 19:47:58 +0200 Subject: [PATCH] add matmul tiled example --- src/sycl_comp.cpp | 144 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 38 deletions(-) diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp index 965e193..6263b99 100644 --- a/src/sycl_comp.cpp +++ b/src/sycl_comp.cpp @@ -1,4 +1,7 @@ +#include #include +#include +#include #include #include #include @@ -53,30 +56,8 @@ auto matrixMultTransposeCPU(const Matrix &matA, auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, const Matrix &matB) { - - // auto d_matA = static_cast(sycl::malloc_device(matA.bytes(), q)); - // q.memcpy(d_matA, matA.mem.data(), matA.bytes()); - - // auto d_matB = static_cast(sycl::malloc_device(matB_t.bytes(), q)); - // q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes()); - Matrix matRes(matA.rows, matB.cols); - - // auto d_matRes = static_cast(sycl::malloc_device(matRes.bytes(), q)); - - // std::size_t max_group_size = - // q.get_device().get_info(); - - // lets assume we always have a maximum group size with a power of 2 - // const std::uint32_t local_one_dim = - // std::pow(2, static_cast(std::log2(max_group_size) / 2)); - sycl::range<2> global_range(matRes.rows, matRes.cols); - // sycl::range<2> local_range( - // local_one_dim > matRes.rows ? matRes.rows : local_one_dim, - // local_one_dim > matRes.cols ? matRes.cols : local_one_dim); - - q.wait(); { sycl::buffer b_matA(matA.mem.data(), @@ -115,24 +96,8 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, const Matrix &matB) { Matrix matB_t = matB.t(); - Matrix matRes(matA.rows, matB.cols); - - // auto d_matRes = static_cast(sycl::malloc_device(matRes.bytes(), q)); - - // std::size_t max_group_size = - // q.get_device().get_info(); - - // lets assume we always have a maximum group size with a power of 2 - // const std::uint32_t local_one_dim = - // std::pow(2, static_cast(std::log2(max_group_size) / 2)); - sycl::range<2> global_range(matRes.rows, matRes.cols); - // sycl::range<2> local_range( - // local_one_dim > matRes.rows ? matRes.rows : local_one_dim, - // local_one_dim > matRes.cols ? matRes.cols : local_one_dim); - - q.wait(); { sycl::buffer b_matA(matA.mem.data(), @@ -167,6 +132,102 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, return matRes.chksum(); } +/* Obtains the previous power of two from the given integer. + * It works by masking out all ones after the first one bit, + * then leaves the first one bit intact, effectively + * yielding the first power of two < x. */ +inline int prevPowerOfTwo(int x) { + if (x < 0) { + return 0; + } + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return x - (x >> 1); +} + +template +auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA, + const Matrix &matB) { + Matrix matRes(matA.rows, matB.cols); + + std::size_t max_group_size = + q.get_device().get_info(); + + // lets assume we always have a maximum group size with a power of 2 + const std::uint32_t max_block_size = + static_cast(prevPowerOfTwo(std::sqrt(max_group_size))); + + const std::uint32_t block_size = std::min(matA.cols, max_block_size); + + sycl::range<2> global_range(matRes.rows, matRes.cols); + sycl::range<2> local_range(block_size, block_size); + + { + sycl::buffer b_matA(matA.mem.data(), + sycl::range<2>(matA.rows, matA.cols)); + + sycl::buffer b_matB(matB.mem.data(), + sycl::range<2>(matB.rows, matB.cols)); + + sycl::buffer b_matRes(matRes.mem.data(), + sycl::range<2>(matRes.rows, matRes.cols)); + + q.submit([&](sycl::handler &h) { + auto acc_matA = b_matA.template get_access(h); + auto acc_matB = b_matB.template get_access(h); + auto acc_matRes = + b_matRes.template get_access(h); + + sycl::accessor + tileA(local_range, h); + + sycl::accessor + tileB(local_range, h); + + h.parallel_for( + sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> &ID) { + const int i = ID.get_global_id(0); + const int j = ID.get_global_id(1); + + const int local_i = ID.get_local_id(0); + const int local_j = ID.get_local_id(1); + + const int max_tile = ID.get_group_range(0); + + // Current local IDem + T sum = 0; + + for (int tile_i = 0; tile_i < max_tile; tile_i++) { + tileA[local_i][local_j] = + acc_matA[i][tile_i * block_size + local_j]; + tileB[local_j][local_i] = + acc_matB[block_size * tile_i + local_i][j]; + + ID.barrier(sycl::access::fence_space::local_space); + + for (auto k = 0; k < block_size; k++) { + sum += tileA[local_i][k] * tileB[local_j][k]; + } + + ID.barrier(sycl::access::fence_space::local_space); + } + + acc_matRes[i][j] = sum; + }); + }); + } + + q.wait(); + + return matRes.chksum(); +} + auto main(int argc, char **argv) -> int { if (argc != 3) { @@ -180,6 +241,7 @@ auto main(int argc, char **argv) -> int { assert(matA.rows == matB.cols); +#ifdef SEQ_BENCH auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB); print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count()); @@ -187,6 +249,7 @@ auto main(int argc, char **argv) -> int { measure<>::duration(matrixMultTransposeCPU, matA, matB); print_pair("CPU - transposed", cpu_transp_chksum.first, cpu_transp_chksum.second.count()); +#endif sycl::queue cpu_queue(sycl::cpu_selector_v); @@ -208,5 +271,10 @@ auto main(int argc, char **argv) -> int { print_pair("GPU - transposed", gpu_transp_chksum.first, gpu_transp_chksum.second.count()); + auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL, + gpu_queue, matA, matB); + print_pair("GPU - tiled", gpu_tiled_chksum.first, + gpu_tiled_chksum.second.count()); + return EXIT_SUCCESS; }