From 96333022264d01ced4d6b32cb5e79098bafc420c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Max=20L=C3=BCbke?= Date: Fri, 6 Oct 2023 15:16:32 +0200 Subject: [PATCH] refactor code to compile with Intel oneAPI --- src/sycl_comp.cpp | 76 +++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 42 deletions(-) diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp index 9f51d98..55df1c4 100644 --- a/src/sycl_comp.cpp +++ b/src/sycl_comp.cpp @@ -97,19 +97,19 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, // cell of the product. Thus, leading to a problem size of N x M sycl::range<2> global_range(matRes.rows, matRes.cols); + const auto &inner_loop = matA.cols; + { // defining 2 dimensional buffers which can then be exposed to the device. // It also possible to use 1D buffers here, but then we have to manually // calculate the index to access the matrices for each thread in the kernel // code. Solving it this way will ask the compiler to do the work. - sycl::buffer b_matA(matA.mem.data(), - sycl::range<2>(matA.rows, matA.cols)); + sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols)); - sycl::buffer b_matB(matB.mem.data(), - sycl::range<2>(matB.rows, matB.cols)); + sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols)); - sycl::buffer b_matRes(matRes.mem.data(), - sycl::range<2>(matRes.rows, matRes.cols)); + sycl::buffer b_matRes(matRes.mem.data(), + sycl::range(matRes.rows, matRes.cols)); // submit work to the device. this is done by using a lambda function which // references all values known to the scope i.e. the previously defined @@ -121,10 +121,9 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, // // Here, we only the matrix A and B to be read from and the matrix C to be // written to. - auto acc_matA = b_matA.template get_access(h); - auto acc_matB = b_matB.template get_access(h); - auto acc_matRes = - b_matRes.template get_access(h); + sycl::accessor acc_matA(b_matA, h, sycl::read_only); + sycl::accessor acc_matB(b_matB, h, sycl::read_only); + sycl::accessor acc_matRes(b_matRes, h, sycl::write_only); // For the parallelized loop another lambda function is used, but all // known values are passed by value, as host and device doesn't share the @@ -134,12 +133,12 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix &matA, // the global range we defined earlier to provide the size of the problem // and launch the count of tasks accordingly. The identifier of the task // is then passed to the lambda function as a parameter. - h.parallel_for(global_range, [=](sycl::id<2> ID) { + h.parallel_for(global_range, [=](auto ID) { const auto i = ID[0]; const auto j = ID[1]; T sum = 0; - for (auto k = 0; k < matA.cols; k++) { + for (auto k = 0; k < inner_loop; k++) { sum += acc_matA[i][k] * acc_matB[k][j]; } acc_matRes[i][j] = sum; @@ -176,30 +175,30 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, Matrix matB_t = matB.t(); Matrix matRes(matA.rows, matB.cols); - sycl::range<2> global_range(matRes.rows, matRes.cols); + sycl::range global_range(matRes.rows, matRes.cols); + + const auto &inner_loop = matA.cols; { - sycl::buffer b_matA(matA.mem.data(), - sycl::range<2>(matA.rows, matA.cols)); + sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols)); - sycl::buffer b_matB(matB_t.mem.data(), - sycl::range<2>(matB_t.rows, matB_t.cols)); + sycl::buffer b_matB(matB_t.mem.data(), + sycl::range(matB_t.rows, matB_t.cols)); - sycl::buffer b_matRes(matRes.mem.data(), - sycl::range<2>(matRes.rows, matRes.cols)); + sycl::buffer b_matRes(matRes.mem.data(), + sycl::range(matRes.rows, matRes.cols)); q.submit([&](sycl::handler &h) { - auto acc_matA = b_matA.template get_access(h); - auto acc_matB = b_matB.template get_access(h); - auto acc_matRes = - b_matRes.template get_access(h); + sycl::accessor acc_matA(b_matA, h, sycl::read_only); + sycl::accessor acc_matB(b_matB, h, sycl::read_only); + sycl::accessor acc_matRes(b_matRes, h, sycl::write_only); - h.parallel_for(global_range, [=](sycl::id<2> ID) { + h.parallel_for(global_range, [=](auto ID) { auto i = ID[0]; auto j = ID[1]; T sum = 0; - for (auto k = 0; k < matA.cols; k++) { + for (auto k = 0; k < inner_loop; k++) { sum += acc_matA[i][k] * acc_matB[j][k]; } acc_matRes[i][j] = sum; @@ -287,32 +286,25 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA, { // allocate the buffers - sycl::buffer b_matA(matA.mem.data(), - sycl::range<2>(matA.rows, matA.cols)); + sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols)); - sycl::buffer b_matB(matB.mem.data(), - sycl::range<2>(matB.rows, matB.cols)); + sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols)); - sycl::buffer b_matRes(matRes.mem.data(), - sycl::range<2>(matRes.rows, matRes.cols)); + sycl::buffer b_matRes(matRes.mem.data(), + sycl::range(matRes.rows, matRes.cols)); q.submit([&](sycl::handler &h) { // provide access to the buffers and ... - auto acc_matA = b_matA.template get_access(h); - auto acc_matB = b_matB.template get_access(h); - auto acc_matRes = - b_matRes.template get_access(h); + sycl::accessor acc_matA(b_matA, h, sycl::read_only); + sycl::accessor acc_matB(b_matB, h, sycl::read_only); + sycl::accessor acc_matRes(b_matRes, h, sycl::write_only); // ... allocate memory in the local device memory which should be // accessble to each thread per matrix A ... - sycl::accessor - tileA(tile_range, h); + sycl::local_accessor tileA(tile_range, h); // ... and matrix B - sycl::accessor - tileB(tile_range, h); + sycl::local_accessor tileB(tile_range, h); // We define a kernel function by passing the global_range and the // tile_range to the parallel_for function of the handler. Secondly, @@ -320,7 +312,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA, // passed-by-value lambda captures. As a parameter serves a nd_item, which // can be used to extract all relevant data linked to the running task. h.parallel_for( - sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) { + sycl::nd_range{global_range, tile_range}, [=](auto ID) { // extract all relevant information const int i = ID.get_global_id(0); const int j = ID.get_global_id(1);