From d0673e61ca3f4ee732d57c240bb20cd02c928a41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Max=20L=C3=BCbke?=
Date: Wed, 4 Oct 2023 09:54:15 +0200
Subject: [PATCH] add omp/gpu naive and transposed code

---
 matrix.hpp    |   4 +-
 sycl_comp.cpp | 203 ++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 158 insertions(+), 49 deletions(-)

diff --git a/matrix.hpp b/matrix.hpp
index da56d3d..91f3aa7 100644
--- a/matrix.hpp
+++ b/matrix.hpp
@@ -28,7 +28,7 @@ template <typename T> struct Matrix {
     }
 
     matfs >> this->rows >> this->cols;
-    this->data.resize(this->rows * this->cols);
+    this->mem.resize(this->rows * this->cols);
 
     for (std::uint32_t i = 0; i < rows; i++) {
       for (std::uint32_t j = 0; j < cols; j++) {
@@ -67,7 +67,7 @@ template <typename T> struct Matrix {
 
   XXH32_hash_t chksum() const {
     constexpr XXH32_hash_t HASH_SEED = 42;
-    return XXH32(this->data.data(), mem.size(), HASH_SEED);
+    return XXH32(this->mem.data(), mem.size(), HASH_SEED);
   }
 
   std::size_t bytes() const { return this->mem.size() * sizeof(T); }
diff --git a/sycl_comp.cpp b/sycl_comp.cpp
index 63d5779..8e755b7 100644
--- a/sycl_comp.cpp
+++ b/sycl_comp.cpp
@@ -18,9 +18,11 @@ namespace sycl = cl::sycl;
             << "\t-> Check: 0x" << stream_hex(_chksm)                         \
             << "\tRuntime: " << _time << " us\n\n"
 
-template <typename T>
-auto matrixMultCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
-  Matrix<T> res(matA.rows, matB.cols);
+using data_type = int;
+
+auto matrixMultCPU(const Matrix<data_type> &matA,
+                   const Matrix<data_type> &matB) {
+  Matrix<data_type> res(matA.rows, matB.cols);
   for (std::uint32_t i = 0; i < res.rows; i++) {
     for (std::uint32_t j = 0; j < res.cols; j++) {
       auto &res_val = res(i, j) = 0;
@@ -33,10 +35,10 @@ auto matrixMultCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
   return res.chksum();
 }
 
-template <typename T>
-auto matrixMultTransposeCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
-  Matrix<T> matB_t = matB.t();
-  Matrix<T> res(matA.rows, matB.cols);
+auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
+                            const Matrix<data_type> &matB) {
+  Matrix<data_type> matB_t = matB.t();
+  Matrix<data_type> res(matA.rows, matB.cols);
   for (std::uint32_t i = 0; i < res.rows; i++) {
     for (std::uint32_t j = 0; j < res.cols; j++) {
       auto &res_val = res(i, j) = 0;
@@ -49,6 +51,126 @@ auto matrixMultTransposeCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
   return res.chksum();
 }
 
+auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
+                    const Matrix<data_type> &matB) {
+
+  // auto d_matA = static_cast<data_type *>(sycl::malloc_device(matA.bytes(), q));
+  // q.memcpy(d_matA, matA.mem.data(), matA.bytes());
+
+  // auto d_matB = static_cast<data_type *>(sycl::malloc_device(matB_t.bytes(), q));
+  // q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes());
+
+  Matrix<data_type> matRes(matA.rows, matB.cols);
+
+  // auto d_matRes = static_cast<data_type *>(sycl::malloc_device(matRes.bytes(), q));
+
+  // std::size_t max_group_size =
+  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
+
+  // let's assume we always have a maximum group size with a power of 2
+  // const std::uint32_t local_one_dim =
+  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
+
+  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  // sycl::range<2> local_range(
+  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
+  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
+
+  q.wait();
+
+  {
+    sycl::buffer b_matA(matA.mem.data(),
+                        sycl::range<2>(matA.rows, matA.cols));
+
+    sycl::buffer b_matB(matB.mem.data(),
+                        sycl::range<2>(matB.rows, matB.cols));
+
+    sycl::buffer b_matRes(
+        matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
+
+    q.submit([&](sycl::handler &h) {
+      auto acc_matA = b_matA.get_access(h);
+      auto acc_matB = b_matB.get_access(h);
+      auto acc_matRes = b_matRes.get_access(h);
+
+      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+        auto i = ID[0];
+        auto j = ID[1];
+        data_type sum = 0;
+
+        if (i < global_range.get(0) && j < global_range.get(1)) {
+          for (auto k = 0; k < matA.cols; k++) {
+            sum += acc_matA[i][k] * acc_matB[k][j];
+          }
+          acc_matRes[i][j] = sum;
+        }
+      });
+    });
+  }
+
+  q.wait();
+
+  return matRes.chksum();
+}
+
+auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
+                             const Matrix<data_type> &matB) {
+
+  Matrix<data_type> matB_t = matB.t();
+
+  Matrix<data_type> matRes(matA.rows, matB.cols);
+
+  // auto d_matRes = static_cast<data_type *>(sycl::malloc_device(matRes.bytes(), q));
+
+  // std::size_t max_group_size =
+  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
+
+  // let's assume we always have a maximum group size with a power of 2
+  // const std::uint32_t local_one_dim =
+  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
+
+  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  // sycl::range<2> local_range(
+  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
+  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
+
+  q.wait();
+
+  {
+    sycl::buffer b_matA(matA.mem.data(),
+                        sycl::range<2>(matA.rows, matA.cols));
+
+    sycl::buffer b_matB(matB_t.mem.data(),
+                        sycl::range<2>(matB_t.rows, matB_t.cols));
+
+    sycl::buffer b_matRes(
+        matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
+
+    q.submit([&](sycl::handler &h) {
+      auto acc_matA = b_matA.get_access(h);
+      auto acc_matB = b_matB.get_access(h);
+      auto acc_matRes = b_matRes.get_access(h);
+
+      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+        auto i = ID[0];
+        auto j = ID[1];
+        data_type sum = 0;
+
+        if (i < global_range.get(0) && j < global_range.get(1)) {
+          for (auto k = 0; k < matA.cols; k++) {
+            sum += acc_matA[i][k] * acc_matB[j][k];
+          }
+          acc_matRes[i][j] = sum;
+        }
+      });
+    });
+  }
+
+  q.wait();
+
+  return matRes.chksum();
+}
+
 auto main(int argc, char **argv) -> int {
 
   if (argc != 3) {
@@ -57,51 +179,38 @@ auto main(int argc, char **argv) -> int {
     return EXIT_FAILURE;
   }
 
-  Matrix<int> matA(argv[1]);
-  Matrix<int> matB(argv[2]);
+  Matrix<data_type> matA(argv[1]);
+  Matrix<data_type> matB(argv[2]);
   assert(matA.rows == matB.cols);
 
-  auto cpu_chksum = measure<>::duration(matrixMultCPU<int>, matA, matB);
-  std::cout << "CPU only \n\t->"
-            << "Check: 0x" << stream_hex(cpu_chksum.first)
-            << "\tRuntime: " << cpu_chksum.second.count() << " us\n\n";
+  auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
+  print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
 
   auto cpu_transp_chksum =
-      measure<>::duration(matrixMultTransposeCPU<int>, matA, matB);
-  std::cout << "CPU only - transposed \n\t->"
-            << "Check: 0x" << stream_hex(cpu_transp_chksum.first)
-            << "\tRuntime: " << cpu_transp_chksum.second.count() << " us\n\n";
+      measure<>::duration(matrixMultTransposeCPU, matA, matB);
+  print_pair("CPU - transposed", cpu_transp_chksum.first,
+             cpu_transp_chksum.second.count());
+
+  sycl::queue cpu_queue(sycl::cpu_selector_v);
+
+  auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB);
+  print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count());
+
+  auto omp_transp_chksum =
+      measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB);
+  print_pair("OMP - transposed", omp_transp_chksum.first,
+             omp_transp_chksum.second.count());
+
+  sycl::queue gpu_queue(sycl::gpu_selector_v);
+
+  auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB);
+  print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count());
+
+  auto gpu_transp_chksum =
+      measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB);
+  print_pair("GPU - transposed", gpu_transp_chksum.first,
+             gpu_transp_chksum.second.count());
 
   return EXIT_SUCCESS;
-
-  // sycl::queue q;
-
-  // std::cout << "Using device: "
-  //           << q.get_device().get_info<sycl::info::device::name>() << "\n";
-
-  // int hostArray[42];
-  // auto deviceArray = static_cast<int *>(malloc_device(42 * sizeof(int), q));
-
-  // for (int i = 0; i < 42; i++) {
-  //   hostArray[i] = i;
-  // }
-
-  // q.memcpy(deviceArray, hostArray, 42 * sizeof(int));
-  // q.wait();
-
-  // q.submit([&](sycl::handler &h) {
-  //   h.parallel_for(sycl::range<1>(42), [=](auto ID) { deviceArray[ID]++; });
-  // });
-
-  // q.wait();
-
-  // q.memcpy(hostArray, deviceArray, 42 * sizeof(int));
-  // q.wait();
-
-  // for (int i = 0; i < 42; i++) {
-  //   std::cout << hostArray[i] << " ";
-  // }
-
-  // std::cout << "\n";
 }