refactor code to compile with Intel oneAPI
parent 3fcd37fb1f
commit 9633302226
@@ -97,19 +97,19 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
   // cell of the product. Thus, leading to a problem size of N x M
   sycl::range<2> global_range(matRes.rows, matRes.cols);
 
+  const auto &inner_loop = matA.cols;
+
   {
     // defining 2 dimensional buffers which can then be exposed to the device.
     // It is also possible to use 1D buffers here, but then we have to manually
     // calculate the index to access the matrices for each thread in the kernel
     // code. Solving it this way will ask the compiler to do the work.
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB.mem.data(),
-                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     // submit work to the device. this is done by using a lambda function which
     // references all values known to the scope i.e. the previously defined
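The buffer changes in this hunk lean on SYCL 2020 class template argument deduction: the element type and dimensionality are deduced from the pointer and the range, so `sycl::buffer<T, 2>` and `sycl::range<2>` no longer have to be spelled out. A minimal sketch of what the deduction resolves to (the float data here is illustrative, not from the repo):

    #include <sycl/sycl.hpp>
    #include <vector>

    std::vector<float> data(4 * 8);
    // element type and dimensionality are deduced from the arguments:
    sycl::buffer buf(data.data(), sycl::range(4, 8));
    // deduced type is sycl::buffer<float, 2>, identical to the old spelling:
    // sycl::buffer<float, 2> buf(data.data(), sycl::range<2>(4, 8));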
@@ -121,10 +121,9 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
       //
       // Here, we only need the matrices A and B to be read from and the
       // matrix C to be written to.
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
       // For the parallelized loop another lambda function is used, but all
       // known values are passed by value, as host and device don't share the
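The older SYCL 1.2.1-style `get_access<mode>(handler)` spelling is traded here for the SYCL 2020 accessor constructor, where a tag object (`sycl::read_only`, `sycl::write_only`, `sycl::read_write`) selects the access mode and deduction fills in the rest. Taking the first accessor above as the example, the two forms are functionally equivalent inside the command group:

    // SYCL 1.2.1-style spelling:
    auto a_old = b_matA.template get_access<sycl::access::mode::read>(h);
    // SYCL 2020 idiom; deduces sycl::accessor<T, 2, sycl::access_mode::read>:
    sycl::accessor a_new(b_matA, h, sycl::read_only);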
@@ -134,12 +133,12 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
       // the global range we defined earlier to provide the size of the problem
       // and launch the count of tasks accordingly. The identifier of the task
       // is then passed to the lambda function as a parameter.
-      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+      h.parallel_for(global_range, [=](auto ID) {
         const auto i = ID[0];
         const auto j = ID[1];
         T sum = 0;
 
-        for (auto k = 0; k < matA.cols; k++) {
+        for (auto k = 0; k < inner_loop; k++) {
           sum += acc_matA[i][k] * acc_matB[k][j];
         }
         acc_matRes[i][j] = sum;
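The new `inner_loop` alias is the actual compile fix in this hunk: with `[=]`, using `matA.cols` inside the kernel would capture the whole `Matrix<T>` by value, and DPC++ requires every capture to be device-copyable, which a type holding host storage (presumably a std::vector behind `matA.mem`) is not. Capturing only the extent sidesteps that. A standalone sketch of the pattern, with `Matrix2` as a hypothetical stand-in for the repo's `Matrix<T>`:

    #include <sycl/sycl.hpp>
    #include <vector>

    struct Matrix2 {               // hypothetical stand-in for Matrix<T>
      std::vector<int> mem;        // host storage: not device-copyable
      size_t rows, cols;
    };

    void kernel_sketch(sycl::queue &q, const Matrix2 &m) {
      const auto &cols = m.cols;   // capture only the trivially copyable extent
      q.submit([&](sycl::handler &h) {
        // writing `m.cols` in the body would capture `m` itself and fail to
        // compile; `cols` is copied in as a plain integer instead
        h.parallel_for(sycl::range(m.rows), [=](auto id) {
          size_t n = cols;
          (void)n;
        });
      });
    }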
@@ -176,30 +175,30 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
 
   Matrix<T> matB_t = matB.t();
   Matrix<T> matRes(matA.rows, matB.cols);
-  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  sycl::range global_range(matRes.rows, matRes.cols);
+
+  const auto &inner_loop = matA.cols;
 
   {
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB_t.mem.data(),
-                              sycl::range<2>(matB_t.rows, matB_t.cols));
+    sycl::buffer b_matB(matB_t.mem.data(),
+                        sycl::range(matB_t.rows, matB_t.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
-      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+      h.parallel_for(global_range, [=](auto ID) {
         auto i = ID[0];
         auto j = ID[1];
         T sum = 0;
 
-        for (auto k = 0; k < matA.cols; k++) {
+        for (auto k = 0; k < inner_loop; k++) {
           sum += acc_matA[i][k] * acc_matB[j][k];
         }
         acc_matRes[i][j] = sum;
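In this transposed variant the inner product reads `acc_matB[j][k]`, so both operands are walked row-wise and therefore contiguously, which is the point of multiplying against `matB.t()`. A hedged usage sketch, assuming `Matrix<T>(rows, cols)` allocates storage and both functions return the result matrix as their signatures suggest:

    sycl::queue q;                               // default-selected device
    Matrix<float> A(256, 512), B(512, 128);
    auto C1 = matrixMultSYCL(q, A, B);
    auto C2 = matrixMultTransposeSYCL(q, A, B);  // same product, row-wise B reads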
@@ -287,32 +286,25 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
 
   {
     // allocate the buffers
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB.mem.data(),
-                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
       // provide access to the buffers and ...
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
       // ... allocate memory in the local device memory which should be
       // accessible to each thread, per matrix A ...
-      sycl::accessor<int, 2, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          tileA(tile_range, h);
+      sycl::local_accessor<T, 2> tileA(tile_range, h);
 
       // ... and matrix B
-      sycl::accessor<int, 2, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          tileB(tile_range, h);
+      sycl::local_accessor<T, 2> tileB(tile_range, h);
 
       // We define a kernel function by passing the global_range and the
       // tile_range to the parallel_for function of the handler. Secondly,
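The tile buffers show two fixes at once: SYCL 2020 deprecates local-target accessors in favor of `sycl::local_accessor`, and the old declarations hard-coded `int` as the element type even though the matrices hold `T`. Side by side, inside the command group:

    // deprecated local-target accessor (and wrong element type here):
    // sycl::accessor<int, 2, sycl::access::mode::read_write,
    //                sycl::access::target::local> tileA(tile_range, h);
    // SYCL 2020 replacement, element type now matching the matrices:
    sycl::local_accessor<T, 2> tileA(tile_range, h);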
@@ -320,7 +312,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
       // passed-by-value lambda captures. The parameter is an nd_item, which
       // can be used to extract all relevant data linked to the running task.
       h.parallel_for<class tiled_matmul>(
-          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) {
+          sycl::nd_range{global_range, tile_range}, [=](auto ID) {
            // extract all relevant information
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);
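Dropping the reference from the kernel parameter is the other oneAPI compile fix: a SYCL kernel receives its nd_item by value, and the non-const `sycl::nd_item<2> &` parameter is evidently what DPC++ rejected here. `auto` lets the nd_range overload deduce the by-value nd_item; written out explicitly, the accepted form would be:

    h.parallel_for<class tiled_matmul>(
        sycl::nd_range{global_range, tile_range},
        [=](sycl::nd_item<2> ID) {   // by value, not by reference
          const int i = ID.get_global_id(0);
          const int j = ID.get_global_id(1);
        });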