add matmul tiled example
This commit is contained in:
parent
86842a6a15
commit
0e6e99c5dd
@ -1,4 +1,7 @@
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
@ -53,30 +56,8 @@ auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
|
||||
|
||||
auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
|
||||
const Matrix<data_type> &matB) {
|
||||
|
||||
// auto d_matA = static_cast<T *>(sycl::malloc_device(matA.bytes(), q));
|
||||
// q.memcpy(d_matA, matA.mem.data(), matA.bytes());
|
||||
|
||||
// auto d_matB = static_cast<T *>(sycl::malloc_device(matB_t.bytes(), q));
|
||||
// q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes());
|
||||
|
||||
Matrix<data_type> matRes(matA.rows, matB.cols);
|
||||
|
||||
// auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
|
||||
|
||||
// std::size_t max_group_size =
|
||||
// q.get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
// lets assume we always have a maximum group size with a power of 2
|
||||
// const std::uint32_t local_one_dim =
|
||||
// std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
|
||||
|
||||
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
||||
// sycl::range<2> local_range(
|
||||
// local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
|
||||
// local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
|
||||
|
||||
q.wait();
|
||||
|
||||
{
|
||||
sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
|
||||
@ -115,24 +96,8 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
|
||||
const Matrix<data_type> &matB) {
|
||||
|
||||
Matrix<data_type> matB_t = matB.t();
|
||||
|
||||
Matrix<data_type> matRes(matA.rows, matB.cols);
|
||||
|
||||
// auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
|
||||
|
||||
// std::size_t max_group_size =
|
||||
// q.get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
// lets assume we always have a maximum group size with a power of 2
|
||||
// const std::uint32_t local_one_dim =
|
||||
// std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
|
||||
|
||||
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
||||
// sycl::range<2> local_range(
|
||||
// local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
|
||||
// local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
|
||||
|
||||
q.wait();
|
||||
|
||||
{
|
||||
sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
|
||||
@ -167,6 +132,102 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
|
||||
return matRes.chksum();
|
||||
}
|
||||
|
||||
/* Obtains the previous power of two from the given integer.
|
||||
* It works by masking out all ones after the first one bit,
|
||||
* then leaves the first one bit intact, effectively
|
||||
* yielding the first power of two < x. */
|
||||
inline int prevPowerOfTwo(int x) {
|
||||
if (x < 0) {
|
||||
return 0;
|
||||
}
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
x |= x >> 4;
|
||||
x |= x >> 8;
|
||||
x |= x >> 16;
|
||||
return x - (x >> 1);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
|
||||
const Matrix<T> &matB) {
|
||||
Matrix<T> matRes(matA.rows, matB.cols);
|
||||
|
||||
std::size_t max_group_size =
|
||||
q.get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
// lets assume we always have a maximum group size with a power of 2
|
||||
const std::uint32_t max_block_size =
|
||||
static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));
|
||||
|
||||
const std::uint32_t block_size = std::min(matA.cols, max_block_size);
|
||||
|
||||
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
||||
sycl::range<2> local_range(block_size, block_size);
|
||||
|
||||
{
|
||||
sycl::buffer<T, 2> b_matA(matA.mem.data(),
|
||||
sycl::range<2>(matA.rows, matA.cols));
|
||||
|
||||
sycl::buffer<T, 2> b_matB(matB.mem.data(),
|
||||
sycl::range<2>(matB.rows, matB.cols));
|
||||
|
||||
sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
|
||||
sycl::range<2>(matRes.rows, matRes.cols));
|
||||
|
||||
q.submit([&](sycl::handler &h) {
|
||||
auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
|
||||
auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
|
||||
auto acc_matRes =
|
||||
b_matRes.template get_access<sycl::access::mode::write>(h);
|
||||
|
||||
sycl::accessor<int, 2, sycl::access::mode::read_write,
|
||||
sycl::access::target::local>
|
||||
tileA(local_range, h);
|
||||
|
||||
sycl::accessor<int, 2, sycl::access::mode::read_write,
|
||||
sycl::access::target::local>
|
||||
tileB(local_range, h);
|
||||
|
||||
h.parallel_for<class tiled_matmul>(
|
||||
sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> &ID) {
|
||||
const int i = ID.get_global_id(0);
|
||||
const int j = ID.get_global_id(1);
|
||||
|
||||
const int local_i = ID.get_local_id(0);
|
||||
const int local_j = ID.get_local_id(1);
|
||||
|
||||
const int max_tile = ID.get_group_range(0);
|
||||
|
||||
// Current local IDem
|
||||
T sum = 0;
|
||||
|
||||
for (int tile_i = 0; tile_i < max_tile; tile_i++) {
|
||||
tileA[local_i][local_j] =
|
||||
acc_matA[i][tile_i * block_size + local_j];
|
||||
tileB[local_j][local_i] =
|
||||
acc_matB[block_size * tile_i + local_i][j];
|
||||
|
||||
ID.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
for (auto k = 0; k < block_size; k++) {
|
||||
sum += tileA[local_i][k] * tileB[local_j][k];
|
||||
}
|
||||
|
||||
ID.barrier(sycl::access::fence_space::local_space);
|
||||
}
|
||||
|
||||
acc_matRes[i][j] = sum;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
q.wait();
|
||||
|
||||
return matRes.chksum();
|
||||
}
|
||||
|
||||
auto main(int argc, char **argv) -> int {
|
||||
|
||||
if (argc != 3) {
|
||||
@ -180,6 +241,7 @@ auto main(int argc, char **argv) -> int {
|
||||
|
||||
assert(matA.rows == matB.cols);
|
||||
|
||||
#ifdef SEQ_BENCH
|
||||
auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
|
||||
print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
|
||||
|
||||
@ -187,6 +249,7 @@ auto main(int argc, char **argv) -> int {
|
||||
measure<>::duration(matrixMultTransposeCPU, matA, matB);
|
||||
print_pair("CPU - transposed", cpu_transp_chksum.first,
|
||||
cpu_transp_chksum.second.count());
|
||||
#endif
|
||||
|
||||
sycl::queue cpu_queue(sycl::cpu_selector_v);
|
||||
|
||||
@ -208,5 +271,10 @@ auto main(int argc, char **argv) -> int {
|
||||
print_pair("GPU - transposed", gpu_transp_chksum.first,
|
||||
gpu_transp_chksum.second.count());
|
||||
|
||||
auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL<data_type>,
|
||||
gpu_queue, matA, matB);
|
||||
print_pair("GPU - tiled", gpu_tiled_chksum.first,
|
||||
gpu_tiled_chksum.second.count());
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user