adapt tiled version

This commit is contained in:
Max Lübke 2023-10-12 14:57:26 +02:00
parent 493be3576c
commit 1091a55b81

View File

@ -1,3 +1,5 @@
#include <sycl/sycl.hpp>
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cmath> #include <cmath>
@ -6,7 +8,6 @@
#include <cstdlib> #include <cstdlib>
#include <iostream> #include <iostream>
#include <sycl/sycl.hpp>
#include <utility> #include <utility>
#include "matrix.hpp" #include "matrix.hpp"
@ -211,25 +212,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
return matRes.chksum(); return matRes.chksum();
} }
/* inline int prevPowerOfTwo(int n) {
* Obtained from if (n <= 0) {
* https://github.com/codeplaysoftware/computecpp-sdk/blob/master/samples/matrix-multiply.cpp
*
* Obtains the previous power of two from the given integer.
* It works by masking out all ones after the first one bit,
* then leaves the first one bit intact, effectively
* yielding the first power of two < x. */
inline int prevPowerOfTwo(int x) {
if (x < 0) {
return 0; return 0;
} }
--x;
x |= x >> 1; // Calculate the most significant bit position (MSB)
x |= x >> 2; int msbPos = 0;
x |= x >> 4; while (n > 1) {
x |= x >> 8; n >>= 1;
x |= x >> 16; msbPos++;
return x - (x >> 1); }
return 1 << msbPos;
} }
/** /**
@ -331,7 +326,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
tileA[local_i][local_j] = tileA[local_i][local_j] =
acc_matA[i][tile_i * block_size + local_j]; acc_matA[i][tile_i * block_size + local_j];
// here we will also transpose the B matrix // here we will also transpose the B matrix
tileB[local_j][local_i] = tileB[local_i][local_j] =
acc_matB[block_size * tile_i + local_i][j]; acc_matB[block_size * tile_i + local_i][j];
// we need an explicit barrier to ensure all threads of the // we need an explicit barrier to ensure all threads of the
@ -339,10 +334,11 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
// memory // memory
ID.barrier(sycl::access::fence_space::local_space); ID.barrier(sycl::access::fence_space::local_space);
// build the 'local' sum over the part of matrix A and B stored in #pragma unroll
// the local memory
for (auto k = 0; k < block_size; k++) { for (auto k = 0; k < block_size; k++) {
sum += tileA[local_i][k] * tileB[local_j][k]; // build the 'local' sum over the part of matrix A and B stored
// in the local memory
sum += tileA[local_i][k] * tileB[k][local_j];
} }
// ensure all threads finished the multiplication, allowing to // ensure all threads finished the multiplication, allowing to