From 96333022264d01ced4d6b32cb5e79098bafc420c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Max=20L=C3=BCbke?= <mluebke@uni-potsdam.de>
Date: Fri, 6 Oct 2023 15:16:32 +0200
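Subject: [PATCH] refactor code to compile with Intel oneAPI

Use class template argument deduction for the sycl::buffer and sycl::range
objects and construct sycl::accessor with the sycl::read_only and
sycl::write_only tags instead of calling get_access<mode>() on the buffers.

In matrixMultSYCL and matrixMultTransposeSYCL, bind matA.cols to a local
reference (inner_loop) and use it as the bound of the inner loop, so the
kernel lambdas no longer name matA.

Declare the kernel lambda parameters as auto. The tiled kernel previously
took its sycl::nd_item<2> by non-const reference.

Note that this is not purely a refactoring: the tiles in matrixMultTiledSYCL
were declared as sycl::accessor<int, 2, read_write, local> and are now
sycl::local_accessor<T, 2>, so they hold the matrix element type T instead
of int.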
---
 src/sycl_comp.cpp | 76 +++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 42 deletions(-)
diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp
index 9f51d98..55df1c4 100644
--- a/src/sycl_comp.cpp
+++ b/src/sycl_comp.cpp
@@ -97,19 +97,19 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
   // cell of the product. Thus, leading to a problem size of N x M
   sycl::range<2> global_range(matRes.rows, matRes.cols);
 
+  const auto &inner_loop = matA.cols;
+
   {
     // defining 2 dimensional buffers which can then be exposed to the device.
     // It also possible to use 1D buffers here, but then we have to manually
     // calculate the index to access the matrices for each thread in the kernel
     // code. Solving it this way will ask the compiler to do the work.
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB.mem.data(),
-                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     // submit work to the device. this is done by using a lambda function which
     // references all values known to the scope i.e. the previously defined
@@ -121,10 +121,9 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
       //
       // Here, we only the matrix A and B to be read from and the matrix C to be
       // written to.
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
       // For the parallelized loop another lambda function is used, but all
       // known values are passed by value, as host and device doesn't share the
@@ -134,12 +133,12 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
       // the global range we defined earlier to provide the size of the problem
       // and launch the count of tasks accordingly. The identifier of the task
       // is then passed to the lambda function as a parameter.
-      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+      h.parallel_for(global_range, [=](auto ID) {
         const auto i = ID[0];
         const auto j = ID[1];
         T sum = 0;
 
-        for (auto k = 0; k < matA.cols; k++) {
+        for (auto k = 0; k < inner_loop; k++) {
           sum += acc_matA[i][k] * acc_matB[k][j];
         }
         acc_matRes[i][j] = sum;
@@ -176,30 +175,30 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
 
   Matrix<T> matB_t = matB.t();
   Matrix<T> matRes(matA.rows, matB.cols);
-  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  sycl::range global_range(matRes.rows, matRes.cols);
+
+  const auto &inner_loop = matA.cols;
 
   {
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB_t.mem.data(),
-                              sycl::range<2>(matB_t.rows, matB_t.cols));
+    sycl::buffer b_matB(matB_t.mem.data(),
+                        sycl::range(matB_t.rows, matB_t.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
-      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+      h.parallel_for(global_range, [=](auto ID) {
         auto i = ID[0];
         auto j = ID[1];
         T sum = 0;
 
-        for (auto k = 0; k < matA.cols; k++) {
+        for (auto k = 0; k < inner_loop; k++) {
           sum += acc_matA[i][k] * acc_matB[j][k];
         }
         acc_matRes[i][j] = sum;
@@ -287,32 +286,25 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
 
   {
     // allocate the buffers
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB.mem.data(),
-                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
       // provide access to the buffers and ...
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
       // ... allocate memory in the local device memory which should be
       // accessble to each thread per matrix A ...
-      sycl::accessor<int, 2, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          tileA(tile_range, h);
+      sycl::local_accessor<T, 2> tileA(tile_range, h);
 
       // ... and matrix B
-      sycl::accessor<int, 2, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          tileB(tile_range, h);
+      sycl::local_accessor<T, 2> tileB(tile_range, h);
 
       // We define a kernel function by passing the global_range and the
       // tile_range to the parallel_for function of the handler. Secondly,
@@ -320,7 +312,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
       // passed-by-value lambda captures. As a parameter serves a nd_item, which
       // can be used to extract all relevant data linked to the running task.
       h.parallel_for<class tiled_matmul>(
-          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) {
+          sycl::nd_range{global_range, tile_range}, [=](auto ID) {
             // extract all relevant information
             const int i = ID.get_global_id(0);
             const int j = ID.get_global_id(1);