add matmul tiled example

This commit is contained in:
Max Lübke 2023-10-04 19:47:58 +02:00
parent 86842a6a15
commit 0e6e99c5dd

View File

@ -1,4 +1,7 @@
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
@ -53,30 +56,8 @@ auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
const Matrix<data_type> &matB) {
// auto d_matA = static_cast<T *>(sycl::malloc_device(matA.bytes(), q));
// q.memcpy(d_matA, matA.mem.data(), matA.bytes());
// auto d_matB = static_cast<T *>(sycl::malloc_device(matB_t.bytes(), q));
// q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes());
Matrix<data_type> matRes(matA.rows, matB.cols);
// auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
// std::size_t max_group_size =
// q.get_device().get_info<sycl::info::device::max_work_group_size>();
// lets assume we always have a maximum group size with a power of 2
// const std::uint32_t local_one_dim =
// std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
sycl::range<2> global_range(matRes.rows, matRes.cols);
// sycl::range<2> local_range(
// local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
// local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
q.wait();
{
sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@ -115,24 +96,8 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
const Matrix<data_type> &matB) {
Matrix<data_type> matB_t = matB.t();
Matrix<data_type> matRes(matA.rows, matB.cols);
// auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
// std::size_t max_group_size =
// q.get_device().get_info<sycl::info::device::max_work_group_size>();
// lets assume we always have a maximum group size with a power of 2
// const std::uint32_t local_one_dim =
// std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
sycl::range<2> global_range(matRes.rows, matRes.cols);
// sycl::range<2> local_range(
// local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
// local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
q.wait();
{
sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@ -167,6 +132,102 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
return matRes.chksum();
}
/* Obtains the largest power of two strictly smaller than the given integer.
 * It works by decrementing x, then smearing the highest set bit into all
 * lower positions, and finally isolating that highest bit.
 * Examples: 8 -> 4, 9 -> 8, 17 -> 16. Returns 0 for x <= 1 (no power of
 * two exists below them).
 *
 * Fix: guard x < 2 instead of x < 0. With the old guard, x == 0 ran
 * --x to -1 and then right-shifted a negative signed int, which is
 * implementation-defined behavior pre-C++20. The returned value for
 * x in {0, 1} is unchanged (0). Assumes int is at most 32 bits wide. */
inline int prevPowerOfTwo(int x) {
  if (x < 2) {
    return 0;
  }
  --x;
  // Propagate the most significant set bit into every lower bit.
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  // All bits below the MSB are now set; subtracting the lower half
  // leaves only the MSB, i.e. the previous power of two.
  return x - (x >> 1);
}
/* Tiled matrix multiplication C = A * B using SYCL local (shared) memory.
 * Each work-group stages a block_size x block_size tile of A and of B
 * (B stored transposed in local memory for coalesced inner products),
 * synchronizes, and accumulates partial dot products per tile.
 *
 * @param q    SYCL queue to submit the kernel to.
 * @param matA left operand (rows x cols).
 * @param matB right operand; assumes matA.cols == matB.rows.
 * @return checksum of the result matrix (matRes.chksum()).
 *
 * NOTE(review): assumes block_size evenly divides matA.cols and both
 * global dimensions (required by nd_range) — confirm for ragged sizes.
 *
 * Fixes vs. previous revision:
 *  - local tiles are now typed T (were int, truncating floating-point data);
 *  - the kernel lambda takes nd_item<2> by value (SYCL passes it by value;
 *    a non-const lvalue reference is ill-formed);
 *  - the tile count is matA.cols / block_size, computed on the host
 *    (get_group_range(0) was only correct for square result matrices);
 *  - inner loop counter is unsigned to match block_size. */
template <class T>
auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
                         const Matrix<T> &matB) {
  Matrix<T> matRes(matA.rows, matB.cols);
  std::size_t max_group_size =
      q.get_device().get_info<sycl::info::device::max_work_group_size>();
  // lets assume we always have a maximum group size with a power of 2
  const std::uint32_t max_block_size = static_cast<std::uint32_t>(
      prevPowerOfTwo(static_cast<int>(std::sqrt(max_group_size))));
  const std::uint32_t block_size =
      std::min(static_cast<std::uint32_t>(matA.cols), max_block_size);
  // Number of K-tiles along the shared dimension (A.cols == B.rows),
  // captured by value into the kernel.
  const std::uint32_t num_tiles =
      static_cast<std::uint32_t>(matA.cols) / block_size;
  sycl::range<2> global_range(matRes.rows, matRes.cols);
  sycl::range<2> local_range(block_size, block_size);
  {
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));
    sycl::buffer<T, 2> b_matB(matB.mem.data(),
                              sycl::range<2>(matB.rows, matB.cols));
    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));
    q.submit([&](sycl::handler &h) {
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);
      // Local scratch tiles must hold T, not int.
      sycl::accessor<T, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          tileA(local_range, h);
      sycl::accessor<T, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          tileB(local_range, h);
      h.parallel_for<class tiled_matmul>(
          sycl::nd_range{global_range, local_range},
          [=](sycl::nd_item<2> ID) {
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);
            const int local_i = ID.get_local_id(0);
            const int local_j = ID.get_local_id(1);
            T sum = 0;
            for (std::uint32_t tile_i = 0; tile_i < num_tiles; tile_i++) {
              // Stage one tile of A row-major and one tile of B
              // transposed, so the inner product reads both tiles
              // along contiguous rows.
              tileA[local_i][local_j] =
                  acc_matA[i][tile_i * block_size + local_j];
              tileB[local_j][local_i] =
                  acc_matB[block_size * tile_i + local_i][j];
              ID.barrier(sycl::access::fence_space::local_space);
              for (std::uint32_t k = 0; k < block_size; k++) {
                sum += tileA[local_i][k] * tileB[local_j][k];
              }
              // Keep fast threads from overwriting the tiles before
              // slow threads finish reading them.
              ID.barrier(sycl::access::fence_space::local_space);
            }
            acc_matRes[i][j] = sum;
          });
    });
  }
  q.wait();
  return matRes.chksum();
}
auto main(int argc, char **argv) -> int {
if (argc != 3) {
@ -180,6 +241,7 @@ auto main(int argc, char **argv) -> int {
assert(matA.rows == matB.cols);
#ifdef SEQ_BENCH
auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
@ -187,6 +249,7 @@ auto main(int argc, char **argv) -> int {
measure<>::duration(matrixMultTransposeCPU, matA, matB);
print_pair("CPU - transposed", cpu_transp_chksum.first,
cpu_transp_chksum.second.count());
#endif
sycl::queue cpu_queue(sycl::cpu_selector_v);
@ -208,5 +271,10 @@ auto main(int argc, char **argv) -> int {
print_pair("GPU - transposed", gpu_transp_chksum.first,
gpu_transp_chksum.second.count());
auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL<data_type>,
gpu_queue, matA, matB);
print_pair("GPU - tiled", gpu_tiled_chksum.first,
gpu_tiled_chksum.second.count());
return EXIT_SUCCESS;
}