From 1091a55b81fb345904b77cdf73ed7604c6323d39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Max=20L=C3=BCbke?= <mluebke@uni-potsdam.de>
Date: Thu, 12 Oct 2023 14:57:26 +0200
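Subject: [PATCH] adapt tiled version

Store the tile of matrix B in local memory without transposing it and read
it as tileB[k][local_j] in the inner product loop, which also gets an
unroll hint. Replace prevPowerOfTwo() with a shift-counting version that
returns the largest power of two <= n (the old one returned the largest
power of two < x). Include <sycl/sycl.hpp> before the standard headers.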
---
 src/sycl_comp.cpp | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)
diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp
index 55df1c4..adc5a37 100644
--- a/src/sycl_comp.cpp
+++ b/src/sycl_comp.cpp
@@ -1,3 +1,5 @@
+#include <sycl/sycl.hpp>
+
 #include <algorithm>
 #include <cassert>
 #include <cmath>
@@ -6,7 +8,6 @@
 #include <cstdlib>
 #include <iostream>
 
-#include <sycl/sycl.hpp>
 #include <utility>
 
 #include "matrix.hpp"
@@ -211,25 +212,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
   return matRes.chksum();
 }
 
-/*
- * Obtained from
- * https://github.com/codeplaysoftware/computecpp-sdk/blob/master/samples/matrix-multiply.cpp
- *
- * Obtains the previous power of two from the given integer.
- * It works by masking out all ones after the first one bit,
- * then leaves the first one bit intact, effectively
- * yielding the first power of two < x. */
-inline int prevPowerOfTwo(int x) {
-  if (x < 0) {
+inline int prevPowerOfTwo(int n) {
+  if (n <= 0) {  // non-positive input: no power of two fits, report 0
     return 0;
   }
-  --x;
-  x |= x >> 1;
-  x |= x >> 2;
-  x |= x >> 4;
-  x |= x >> 8;
-  x |= x >> 16;
-  return x - (x >> 1);
+  // Locate the highest set bit of n by shifting it down to 1. 1 << msbPos is
+  // then the largest power of two <= n (n itself if it is a power of two).
+  int msbPos = 0;
+  while (n > 1) {
+    n >>= 1;
+    msbPos++;
+  }
+
+  return 1 << msbPos;
 }
 
 /**
@@ -331,7 +326,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
               tileA[local_i][local_j] =
                   acc_matA[i][tile_i * block_size + local_j];
-              // here we will also transpose the B matrix
+              // the tile of B is stored as-is (no transpose)
-              tileB[local_j][local_i] =
+              tileB[local_i][local_j] =
                   acc_matB[block_size * tile_i + local_i][j];
 
               // we need an explicit barrier to ensure all threads of the
@@ -339,10 +334,11 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
               // memory
               ID.barrier(sycl::access::fence_space::local_space);
 
-              // build the 'local' sum over the part of matrix A and B stored in
-              // the local memory
+#pragma unroll
               for (auto k = 0; k < block_size; k++) {
-                sum += tileA[local_i][k] * tileB[local_j][k];
+                // build the 'local' sum over the part of matrix A and B stored
+                // in the local memory
+                sum += tileA[local_i][k] * tileB[k][local_j];
               }
 
               // ensure all threads finished the multiplication, allowing to