adapt tiled version
This commit is contained in:
parent
493be3576c
commit
1091a55b81
@ -1,3 +1,5 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
@ -6,7 +8,6 @@
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <utility>
|
||||
|
||||
#include "matrix.hpp"
|
||||
@ -211,25 +212,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
|
||||
return matRes.chksum();
|
||||
}
|
||||
|
||||
/**
 * Rounds a positive integer down to a power of two.
 *
 * Returns the largest power of two that is less than or equal to n, so an
 * exact power of two is returned unchanged (e.g. 8 -> 8, 9 -> 8, 1 -> 1).
 *
 * Note: this differs from the earlier bit-smearing variant adapted from the
 * ComputeCpp SDK matrix-multiply sample, which yielded the power of two
 * strictly below its argument.
 *
 * @param n value to round down
 * @return 0 if n is zero or negative, otherwise the largest power of two
 *         that does not exceed n
 */
inline int prevPowerOfTwo(int n) {
  // There is no power of two at or below a non-positive value.
  if (n <= 0) {
    return 0;
  }

  // Calculate the most significant bit position (MSB): count how many times
  // n can be halved before only its leading one bit remains.
  int msbPos = 0;
  while (n > 1) {
    n >>= 1;
    msbPos++;
  }

  // msbPos is at most 30 for a 32-bit int, so the shift cannot overflow.
  return 1 << msbPos;
}
|
||||
|
||||
/**
|
||||
@ -331,7 +326,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
|
||||
tileA[local_i][local_j] =
|
||||
acc_matA[i][tile_i * block_size + local_j];
|
||||
// here we will also transpose the B matrix
|
||||
tileB[local_j][local_i] =
|
||||
tileB[local_i][local_j] =
|
||||
acc_matB[block_size * tile_i + local_i][j];
|
||||
|
||||
// we need an explicit barrier to ensure all threads of the
|
||||
@ -339,10 +334,11 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
|
||||
// memory
|
||||
ID.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
// build the 'local' sum over the part of matrix A and B stored in
|
||||
// the local memory
|
||||
#pragma unroll
|
||||
for (auto k = 0; k < block_size; k++) {
|
||||
sum += tileA[local_i][k] * tileB[local_j][k];
|
||||
// build the 'local' sum over the part of matrix A and B stored
|
||||
// in the local memory
|
||||
sum += tileA[local_i][k] * tileB[k][local_j];
|
||||
}
|
||||
|
||||
// ensure all threads finished the multiplication, allowing to
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user