From 0e6e99c5dd7c77e8aa4ba479106a10fdae6ceba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Max=20L=C3=BCbke?= <mluebke@uni-potsdam.de>
Date: Wed, 4 Oct 2023 19:47:58 +0200
Subject: [PATCH] add matmul tiled example

---
 src/sycl_comp.cpp | 144 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 106 insertions(+), 38 deletions(-)
diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp
index 965e193..6263b99 100644
--- a/src/sycl_comp.cpp
+++ b/src/sycl_comp.cpp
@@ -1,4 +1,7 @@
+#include <algorithm>
 #include <cassert>
+#include <cmath>
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
@@ -53,30 +56,8 @@ auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
 
 auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
                     const Matrix<data_type> &matB) {
-
-  // auto d_matA = static_cast<T *>(sycl::malloc_device(matA.bytes(), q));
-  // q.memcpy(d_matA, matA.mem.data(), matA.bytes());
-
-  // auto d_matB = static_cast<T *>(sycl::malloc_device(matB_t.bytes(), q));
-  // q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes());
-
   Matrix<data_type> matRes(matA.rows, matB.cols);
-
-  // auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
-
-  // std::size_t max_group_size =
-  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
-
-  // lets assume we always have a maximum group size with a power of 2
-  // const std::uint32_t local_one_dim =
-  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
-
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  // sycl::range<2> local_range(
-  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
-  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
-
-  q.wait();
 
   {
     sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@@ -115,24 +96,8 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
                              const Matrix<data_type> &matB) {
 
   Matrix<data_type> matB_t = matB.t();
-
   Matrix<data_type> matRes(matA.rows, matB.cols);
-
-  // auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
-
-  // std::size_t max_group_size =
-  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
-
-  // lets assume we always have a maximum group size with a power of 2
-  // const std::uint32_t local_one_dim =
-  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
-
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  // sycl::range<2> local_range(
-  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
-  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
-
-  q.wait();
 
   {
     sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@@ -167,6 +132,102 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
   return matRes.chksum();
 }
 
+/* Returns the largest power of two strictly below x (16 -> 8, 9 -> 8) and 0
+ * for x <= 1 (x == 0 relies on -1 >> n staying -1). After the decrement, the
+ * shift/or cascade copies the highest set bit into every lower bit; taking
+ * away half of that mask leaves only the top bit. Covers ints up to 32 bits. */
+inline int prevPowerOfTwo(int x) {
+  if (x < 0) {
+    return 0;
+  }
+  --x; // makes an exact power of two map to the one below it
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return x - (x >> 1); // keep only the highest set bit
+}
+
+/* Multiplies matA by matB on queue q with a tiled SYCL kernel and returns
+ * the checksum of the result. Work-groups are block_size x block_size; all
+ * matrix dimensions are expected to be multiples of block_size. */
+template <class T>
+auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
+                         const Matrix<T> &matB) {
+  Matrix<T> matRes(matA.rows, matB.cols);
+
+  std::size_t max_group_size =
+      q.get_device().get_info<sycl::info::device::max_work_group_size>();
+
+  // A power of two not above sqrt(max_group_size), so the square work-group
+  // of block_size * block_size items stays within the device limit.
+  const std::uint32_t max_block_size =
+      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));
+  const std::uint32_t block_size =
+      std::min<std::uint32_t>(matA.cols, max_block_size);
+  assert(block_size > 0 && matA.cols % block_size == 0 &&
+         matRes.rows % block_size == 0 && matRes.cols % block_size == 0);
+
+  // Number of tiles along the shared dimension (matA columns / matB rows).
+  const std::uint32_t num_tiles = block_size ? matA.cols / block_size : 0;
+
+  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  sycl::range<2> local_range(block_size, block_size);
+
+  {
+    sycl::buffer<T, 2> b_matA(matA.mem.data(),
+                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer<T, 2> b_matB(matB.mem.data(),
+                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
+                                sycl::range<2>(matRes.rows, matRes.cols));
+
+    q.submit([&](sycl::handler &h) {
+      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
+      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
+      auto acc_matRes =
+          b_matRes.template get_access<sycl::access::mode::write>(h);
+
+      // Tiles use the element type T so non-integer values are not truncated.
+      sycl::accessor<T, 2, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          tileA(local_range, h);
+      sycl::accessor<T, 2, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          tileB(local_range, h);
+
+      h.parallel_for<class tiled_matmul>(
+          sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> ID) {
+            const std::size_t i = ID.get_global_id(0);
+            const std::size_t j = ID.get_global_id(1);
+            const std::size_t local_i = ID.get_local_id(0);
+            const std::size_t local_j = ID.get_local_id(1);
+
+            // Partial dot product for result element (i, j).
+            T sum = 0;
+            for (std::uint32_t tile_i = 0; tile_i < num_tiles; tile_i++) {
+              // tileB is written transposed: tileB[col][row].
+              tileA[local_i][local_j] =
+                  acc_matA[i][tile_i * block_size + local_j];
+              tileB[local_j][local_i] =
+                  acc_matB[block_size * tile_i + local_i][j];
+              // Both tiles must be complete before any item reads them.
+              ID.barrier(sycl::access::fence_space::local_space);
+              for (std::uint32_t k = 0; k < block_size; k++) {
+                sum += tileA[local_i][k] * tileB[local_j][k];
+              }
+              // No item may overwrite a tile that others still read.
+              ID.barrier(sycl::access::fence_space::local_space);
+            }
+            acc_matRes[i][j] = sum;
+          });
+    });
+  }
+  q.wait();
+  return matRes.chksum();
+}
+
 auto main(int argc, char **argv) -> int {
 
   if (argc != 3) {
@@ -180,6 +241,7 @@ auto main(int argc, char **argv) -> int {
 
   assert(matA.rows == matB.cols);
 
+#ifdef SEQ_BENCH
   auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
   print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
 
@@ -187,6 +249,7 @@ auto main(int argc, char **argv) -> int {
       measure<>::duration(matrixMultTransposeCPU, matA, matB);
   print_pair("CPU - transposed", cpu_transp_chksum.first,
              cpu_transp_chksum.second.count());
+#endif
 
   sycl::queue cpu_queue(sycl::cpu_selector_v);
 
@@ -208,5 +271,10 @@ auto main(int argc, char **argv) -> int {
   print_pair("GPU - transposed", gpu_transp_chksum.first,
              gpu_transp_chksum.second.count());
 
+  auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL<data_type>,
+                                              gpu_queue, matA, matB);
+  print_pair("GPU - tiled", gpu_tiled_chksum.first,
+             gpu_tiled_chksum.second.count());
+
   return EXIT_SUCCESS;
 }