diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp index 420c7fa..15b7e00 100644 --- a/src/sycl_comp.cpp +++ b/src/sycl_comp.cpp @@ -23,9 +23,9 @@ namespace sycl = cl::sycl; using data_type = int; -auto matrixMultCPU(const Matrix &matA, - const Matrix &matB) { - Matrix res(matA.rows, matB.cols); +template +auto matrixMultCPU(const Matrix &matA, const Matrix &matB) { + Matrix res(matA.rows, matB.cols); for (std::uint32_t i = 0; i < res.rows; i++) { for (std::uint32_t j = 0; j < res.cols; j++) { auto &res_val = res(i, j) = 0; @@ -38,10 +38,11 @@ auto matrixMultCPU(const Matrix &matA, return res.chksum(); } -auto matrixMultTransposeCPU(const Matrix &matA, - const Matrix &matB) { - Matrix matB_t = matB.t(); - Matrix res(matA.rows, matB.cols); +template +auto matrixMultTransposeCPU(const Matrix &matA, + const Matrix &matB) { + Matrix matB_t = matB.t(); + Matrix res(matA.rows, matB.cols); for (std::uint32_t i = 0; i < res.rows; i++) { for (std::uint32_t j = 0; j < res.cols; j++) { auto &res_val = res(i, j) = 0; @@ -54,30 +55,31 @@ auto matrixMultTransposeCPU(const Matrix &matA, return res.chksum(); } -auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, - const Matrix &matB) { - Matrix matRes(matA.rows, matB.cols); +template +auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, + const Matrix &matB) { + Matrix matRes(matA.rows, matB.cols); sycl::range<2> global_range(matRes.rows, matRes.cols); { - sycl::buffer b_matA(matA.mem.data(), + sycl::buffer b_matA(matA.mem.data(), sycl::range<2>(matA.rows, matA.cols)); - sycl::buffer b_matB(matB.mem.data(), + sycl::buffer b_matB(matB.mem.data(), sycl::range<2>(matB.rows, matB.cols)); - sycl::buffer b_matRes( + sycl::buffer b_matRes( matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols)); q.submit([&](sycl::handler &h) { - auto acc_matA = b_matA.get_access(h); - auto acc_matB = b_matB.get_access(h); - auto acc_matRes = b_matRes.get_access(h); + auto acc_matA = b_matA.template get_access(h); + auto acc_matB = b_matB.template get_access(h); + auto acc_matRes = b_matRes.template get_access(h); h.parallel_for(global_range, [=](sycl::id<2> ID) { auto i = ID[0]; auto j = ID[1]; - data_type sum = 0; + T sum = 0; for (auto k = 0; k < matA.cols; k++) { sum += acc_matA[i][k] * acc_matB[k][j]; @@ -92,32 +94,33 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, return matRes.chksum(); } -auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, - const Matrix &matB) { +template +auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, + const Matrix &matB) { - Matrix matB_t = matB.t(); - Matrix matRes(matA.rows, matB.cols); + Matrix matB_t = matB.t(); + Matrix matRes(matA.rows, matB.cols); sycl::range<2> global_range(matRes.rows, matRes.cols); { - sycl::buffer b_matA(matA.mem.data(), + sycl::buffer b_matA(matA.mem.data(), sycl::range<2>(matA.rows, matA.cols)); - sycl::buffer b_matB(matB_t.mem.data(), + sycl::buffer b_matB(matB_t.mem.data(), sycl::range<2>(matB_t.rows, matB_t.cols)); - sycl::buffer b_matRes( + sycl::buffer b_matRes( matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols)); q.submit([&](sycl::handler &h) { - auto acc_matA = b_matA.get_access(h); - auto acc_matB = b_matB.get_access(h); - auto acc_matRes = b_matRes.get_access(h); + auto acc_matA = b_matA.template get_access(h); + auto acc_matB = b_matB.template get_access(h); + auto acc_matRes = b_matRes.template get_access(h); h.parallel_for(global_range, [=](sycl::id<2> ID) { auto i = ID[0]; auto j = ID[1]; - data_type sum = 0; + T sum = 0; for (auto k = 0; k < matA.cols; k++) { sum += acc_matA[i][k] * acc_matB[j][k]; @@ -246,32 +249,32 @@ auto main(int argc, char **argv) -> int { assert(matA.rows == matB.cols); #ifdef SEQ_BENCH - auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB); + auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB); print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count()); auto cpu_transp_chksum = - measure<>::duration(matrixMultTransposeCPU, matA, matB); + measure<>::duration(matrixMultTransposeCPU, matA, matB); print_pair("CPU - transposed", cpu_transp_chksum.first, cpu_transp_chksum.second.count()); #endif sycl::queue cpu_queue(sycl::cpu_selector_v); - auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB); + auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB); print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count()); auto omp_transp_chksum = - measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB); + measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB); print_pair("OMP - transposed", omp_transp_chksum.first, omp_transp_chksum.second.count()); sycl::queue gpu_queue(sycl::gpu_selector_v); - auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB); + auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB); print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count()); auto gpu_transp_chksum = - measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB); + measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB); print_pair("GPU - transposed", gpu_transp_chksum.first, gpu_transp_chksum.second.count());