#include #include #include #include #include #include #include "matrix.hpp" #include "timer.hpp" namespace sycl = cl::sycl; #define stream_hex(_val) std::hex << _val << std::dec #define print_pair(_desc, _chksm, _time) \ std::cout << _desc << "\n\t" \ << "\t-> Check: 0x" << stream_hex(_chksm) \ << "\tRuntime: " << _time << " us\n\n" using data_type = int; auto matrixMultCPU(const Matrix &matA, const Matrix &matB) { Matrix res(matA.rows, matB.cols); for (std::uint32_t i = 0; i < res.rows; i++) { for (std::uint32_t j = 0; j < res.cols; j++) { auto &res_val = res(i, j) = 0; for (std::uint32_t k = 0; k < matA.cols; k++) { res_val += matA(i, k) * matB(k, j); } } } return res.chksum(); } auto matrixMultTransposeCPU(const Matrix &matA, const Matrix &matB) { Matrix matB_t = matB.t(); Matrix res(matA.rows, matB.cols); for (std::uint32_t i = 0; i < res.rows; i++) { for (std::uint32_t j = 0; j < res.cols; j++) { auto &res_val = res(i, j) = 0; for (std::uint32_t k = 0; k < matA.cols; k++) { res_val += matA(i, k) * matB_t(j, k); } } } return res.chksum(); } auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, const Matrix &matB) { // auto d_matA = static_cast(sycl::malloc_device(matA.bytes(), q)); // q.memcpy(d_matA, matA.mem.data(), matA.bytes()); // auto d_matB = static_cast(sycl::malloc_device(matB_t.bytes(), q)); // q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes()); Matrix matRes(matA.rows, matB.cols); // auto d_matRes = static_cast(sycl::malloc_device(matRes.bytes(), q)); // std::size_t max_group_size = // q.get_device().get_info(); // lets assume we always have a maximum group size with a power of 2 // const std::uint32_t local_one_dim = // std::pow(2, static_cast(std::log2(max_group_size) / 2)); sycl::range<2> global_range(matRes.rows, matRes.cols); // sycl::range<2> local_range( // local_one_dim > matRes.rows ? matRes.rows : local_one_dim, // local_one_dim > matRes.cols ? matRes.cols : local_one_dim); q.wait(); { sycl::buffer b_matA(matA.mem.data(), sycl::range<2>(matA.rows, matA.cols)); sycl::buffer b_matB(matB.mem.data(), sycl::range<2>(matB.rows, matB.cols)); sycl::buffer b_matRes( matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols)); q.submit([&](sycl::handler &h) { auto acc_matA = b_matA.get_access(h); auto acc_matB = b_matB.get_access(h); auto acc_matRes = b_matRes.get_access(h); h.parallel_for(global_range, [=](sycl::id<2> ID) { auto i = ID[0]; auto j = ID[1]; data_type sum = 0; for (auto k = 0; k < matA.cols; k++) { sum += acc_matA[i][k] * acc_matB[k][j]; } acc_matRes[i][j] = sum; }); }); } q.wait(); return matRes.chksum(); } auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, const Matrix &matB) { Matrix matB_t = matB.t(); Matrix matRes(matA.rows, matB.cols); // auto d_matRes = static_cast(sycl::malloc_device(matRes.bytes(), q)); // std::size_t max_group_size = // q.get_device().get_info(); // lets assume we always have a maximum group size with a power of 2 // const std::uint32_t local_one_dim = // std::pow(2, static_cast(std::log2(max_group_size) / 2)); sycl::range<2> global_range(matRes.rows, matRes.cols); // sycl::range<2> local_range( // local_one_dim > matRes.rows ? matRes.rows : local_one_dim, // local_one_dim > matRes.cols ? matRes.cols : local_one_dim); q.wait(); { sycl::buffer b_matA(matA.mem.data(), sycl::range<2>(matA.rows, matA.cols)); sycl::buffer b_matB(matB_t.mem.data(), sycl::range<2>(matB_t.rows, matB_t.cols)); sycl::buffer b_matRes( matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols)); q.submit([&](sycl::handler &h) { auto acc_matA = b_matA.get_access(h); auto acc_matB = b_matB.get_access(h); auto acc_matRes = b_matRes.get_access(h); h.parallel_for(global_range, [=](sycl::id<2> ID) { auto i = ID[0]; auto j = ID[1]; data_type sum = 0; for (auto k = 0; k < matA.cols; k++) { sum += acc_matA[i][k] * acc_matB[j][k]; } acc_matRes[i][j] = sum; }); }); } q.wait(); return matRes.chksum(); } auto main(int argc, char **argv) -> int { if (argc != 3) { std::cerr << "Provide 2 arguments to the program!\n" << "Usage: .txt .txt\n"; return EXIT_FAILURE; } Matrix matA(argv[1]); Matrix matB(argv[2]); assert(matA.rows == matB.cols); auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB); print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count()); auto cpu_transp_chksum = measure<>::duration(matrixMultTransposeCPU, matA, matB); print_pair("CPU - transposed", cpu_transp_chksum.first, cpu_transp_chksum.second.count()); sycl::queue cpu_queue(sycl::cpu_selector_v); auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB); print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count()); auto omp_transp_chksum = measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB); print_pair("OMP - transposed", omp_transp_chksum.first, omp_transp_chksum.second.count()); sycl::queue gpu_queue(sycl::gpu_selector_v); auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB); print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count()); auto gpu_transp_chksum = measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB); print_pair("GPU - transposed", gpu_transp_chksum.first, gpu_transp_chksum.second.count()); return EXIT_SUCCESS; }