add matmul tiled example
parent 86842a6a15
commit 0e6e99c5dd
@@ -1,4 +1,7 @@
+#include <algorithm>
 #include <cassert>
+#include <cmath>
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
@@ -53,30 +56,8 @@ auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
 
 auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
                     const Matrix<data_type> &matB) {
-
-  // auto d_matA = static_cast<T *>(sycl::malloc_device(matA.bytes(), q));
-  // q.memcpy(d_matA, matA.mem.data(), matA.bytes());
-
-  // auto d_matB = static_cast<T *>(sycl::malloc_device(matB_t.bytes(), q));
-  // q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes());
-
   Matrix<data_type> matRes(matA.rows, matB.cols);
-
-  // auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
-
-  // std::size_t max_group_size =
-  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
-
-  // lets assume we always have a maximum group size with a power of 2
-  // const std::uint32_t local_one_dim =
-  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
 
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  // sycl::range<2> local_range(
-  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
-  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
-
-  q.wait();
-
   {
     sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@@ -115,24 +96,8 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
                              const Matrix<data_type> &matB) {
-
   Matrix<data_type> matB_t = matB.t();
 
   Matrix<data_type> matRes(matA.rows, matB.cols);
-
-  // auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
-
-  // std::size_t max_group_size =
-  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
-
-  // lets assume we always have a maximum group size with a power of 2
-  // const std::uint32_t local_one_dim =
-  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
 
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  // sycl::range<2> local_range(
-  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
-  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
-
-  q.wait();
-
   {
     sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@@ -167,6 +132,102 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
   return matRes.chksum();
 }
 
+/* Obtains the previous power of two from the given integer.
+ * It works by masking out all ones after the first one bit,
+ * then leaves the first one bit intact, effectively
+ * yielding the first power of two < x. */
+inline int prevPowerOfTwo(int x) {
+  if (x < 0) {
+    return 0;
+  }
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return x - (x >> 1);
+}
+
+template <class T>
+auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
+                         const Matrix<T> &matB) {
+  Matrix<T> matRes(matA.rows, matB.cols);
+
+  std::size_t max_group_size =
+      q.get_device().get_info<sycl::info::device::max_work_group_size>();
+
+  // let's assume the maximum work-group size is always a power of two
+  const std::uint32_t max_block_size =
+      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));
+
+  const std::uint32_t block_size = std::min(matA.cols, max_block_size);
+
+  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  sycl::range<2> local_range(block_size, block_size); // global_range must divide evenly by block_size
+
+  {
+    sycl::buffer<T, 2> b_matA(matA.mem.data(),
+                              sycl::range<2>(matA.rows, matA.cols));
+
+    sycl::buffer<T, 2> b_matB(matB.mem.data(),
+                              sycl::range<2>(matB.rows, matB.cols));
+
+    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
+                                sycl::range<2>(matRes.rows, matRes.cols));
+
+    q.submit([&](sycl::handler &h) {
+      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
+      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
+      auto acc_matRes =
+          b_matRes.template get_access<sycl::access::mode::write>(h);
+
+      sycl::accessor<T, 2, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          tileA(local_range, h);
+
+      sycl::accessor<T, 2, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          tileB(local_range, h);
+
+      h.parallel_for<class tiled_matmul>(
+          sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> ID) {
+            const int i = ID.get_global_id(0);
+            const int j = ID.get_global_id(1);
+
+            const int local_i = ID.get_local_id(0);
+            const int local_j = ID.get_local_id(1);
+
+            const int max_tile = ID.get_group_range(0); // tiles along k; assumes matA is square
+
+            // partial sum for this work-item's output element
+            T sum = 0;
+
+            for (int tile_i = 0; tile_i < max_tile; tile_i++) {
+              tileA[local_i][local_j] =
+                  acc_matA[i][tile_i * block_size + local_j];
+              tileB[local_j][local_i] =
+                  acc_matB[block_size * tile_i + local_i][j];
+
+              ID.barrier(sycl::access::fence_space::local_space);
+
+              for (std::uint32_t k = 0; k < block_size; k++) {
+                sum += tileA[local_i][k] * tileB[local_j][k];
+              }
+
+              ID.barrier(sycl::access::fence_space::local_space);
+            }
+
+            acc_matRes[i][j] = sum;
+          });
+    });
+  }
+
+  q.wait();
+
+  return matRes.chksum();
+}
+
 auto main(int argc, char **argv) -> int {
 
   if (argc != 3) {
@@ -180,6 +241,7 @@ auto main(int argc, char **argv) -> int {
 
   assert(matA.rows == matB.cols);
 
+#ifdef SEQ_BENCH
   auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
   print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
 
@@ -187,6 +249,7 @@ auto main(int argc, char **argv) -> int {
       measure<>::duration(matrixMultTransposeCPU, matA, matB);
   print_pair("CPU - transposed", cpu_transp_chksum.first,
              cpu_transp_chksum.second.count());
+#endif
 
   sycl::queue cpu_queue(sycl::cpu_selector_v);
 
@@ -208,5 +271,10 @@ auto main(int argc, char **argv) -> int {
   print_pair("GPU - transposed", gpu_transp_chksum.first,
              gpu_transp_chksum.second.count());
 
+  auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL<data_type>,
+                                              gpu_queue, matA, matB);
+  print_pair("GPU - tiled", gpu_tiled_chksum.first,
+             gpu_tiled_chksum.second.count());
+
   return EXIT_SUCCESS;
 }
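
Note: a quick sanity check of the prevPowerOfTwo helper introduced above. For x = 48 the decrement gives 47 (0b101111), the shift-or cascade smears the leading bit into 63 (0b111111), and x - (x >> 1) keeps only that leading bit, yielding 32. A minimal standalone check (plain C++; the helper is copied from the diff so the snippet compiles on its own, and the test values are illustrative):

#include <cassert>

// Same helper as in the commit above, repeated so the check is self-contained.
inline int prevPowerOfTwo(int x) {
  if (x < 0) {
    return 0;
  }
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x - (x >> 1);
}

int main() {
  assert(prevPowerOfTwo(48) == 32); // largest power of two strictly below 48
  assert(prevPowerOfTwo(32) == 16); // exact powers of two map to the next one below
  assert(prevPowerOfTwo(1) == 0);   // nothing strictly below 1
  return 0;
}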
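This helper drives the work-group sizing in matrixMultTiledSYCL: with a hypothetical max_work_group_size of 1024, std::sqrt gives 32 and prevPowerOfTwo(32) gives 16, so each tile is 16x16 = 256 work-items, safely under the device limit. A sketch of that host-side computation under assumed values (the 1024 limit and the 512-column matrix are made up for illustration):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>

// prevPowerOfTwo as in the commit (see the previous snippet for the full body).
inline int prevPowerOfTwo(int x) {
  if (x < 0) return 0;
  --x;
  x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
  return x - (x >> 1);
}

int main() {
  const std::size_t max_group_size = 1024; // assumed device limit, purely illustrative
  const std::uint32_t max_block_size =
      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));
  const std::uint32_t cols = 512; // stand-in for matA.cols
  const std::uint32_t block_size = std::min(cols, max_block_size);

  // Prints "16 256": a 16x16 tile stays well under the 1024-work-item limit.
  std::cout << block_size << " " << block_size * block_size << "\n";
  return 0;
}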
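For checking the kernel's tile indexing, here is a plain-C++ sketch of the same blocking scheme, written against flat row-major arrays rather than the repo's Matrix type; the signature and the square-matrix/divisibility assumptions are mine, mirroring the kernel's requirement that the global range divide evenly by the local range.

#include <cstddef>
#include <vector>

// Reference tiled multiply: res += a * b for n x n row-major matrices.
// Assumes n % block == 0 and that res is zero-initialized by the caller.
void matmulTiledRef(const std::vector<float> &a, const std::vector<float> &b,
                    std::vector<float> &res, std::size_t n, std::size_t block) {
  for (std::size_t bi = 0; bi < n; bi += block) {     // tile row of the result
    for (std::size_t bj = 0; bj < n; bj += block) {   // tile column of the result
      for (std::size_t bk = 0; bk < n; bk += block) { // tile index along k (the kernel's tile_i)
        // One (bi, bj, bk) iteration is the work a single SYCL work-group does
        // per pass of its tile loop: multiply the (bi, bk) tile of a with the
        // (bk, bj) tile of b and accumulate into the (bi, bj) tile of res.
        for (std::size_t i = bi; i < bi + block; ++i) {
          for (std::size_t j = bj; j < bj + block; ++j) {
            float sum = 0;
            for (std::size_t k = bk; k < bk + block; ++k) {
              sum += a[i * n + k] * b[k * n + j];
            }
            res[i * n + j] += sum;
          }
        }
      }
    }
  }
}

Comparing a checksum of res against the value returned by matrixMultTiledSYCL on the same input is a cheap way to validate the kernel's indexing.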