From 1091a55b81fb345904b77cdf73ed7604c6323d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Max=20L=C3=BCbke?= Date: Thu, 12 Oct 2023 14:57:26 +0200 Subject: [PATCH] adapt tiled version --- src/sycl_comp.cpp | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp index 55df1c4..adc5a37 100644 --- a/src/sycl_comp.cpp +++ b/src/sycl_comp.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -6,7 +8,6 @@ #include #include -#include #include #include "matrix.hpp" @@ -211,25 +212,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA, return matRes.chksum(); } -/* - * Obtained from - * https://github.com/codeplaysoftware/computecpp-sdk/blob/master/samples/matrix-multiply.cpp - * - * Obtains the previous power of two from the given integer. - * It works by masking out all ones after the first one bit, - * then leaves the first one bit intact, effectively - * yielding the first power of two < x. */ -inline int prevPowerOfTwo(int x) { - if (x < 0) { +inline int prevPowerOfTwo(int n) { + if (n <= 0) { return 0; } - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return x - (x >> 1); + + // Calculate the most significant bit position (MSB) + int msbPos = 0; + while (n > 1) { + n >>= 1; + msbPos++; + } + + return 1 << msbPos; } /** @@ -331,7 +326,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA, tileA[local_i][local_j] = acc_matA[i][tile_i * block_size + local_j]; // here we will also transpose the B matrix - tileB[local_j][local_i] = + tileB[local_i][local_j] = acc_matB[block_size * tile_i + local_i][j]; // we need an explicit barrier to ensure all threads of the @@ -339,10 +334,11 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA, // memory ID.barrier(sycl::access::fence_space::local_space); - // build the 'local' sum over the part of matrix A and B stored in - // the local memory +#pragma unroll for (auto k = 0; k < block_size; k++) { - sum += tileA[local_i][k] * tileB[local_j][k]; + // build the 'local' sum over the part of matrix A and B stored + // in the local memory + sum += tileA[local_i][k] * tileB[k][local_j]; } // ensure all threads finished the multiplication, allowing to