add matmul tiled example
parent 86842a6a15
commit 0e6e99c5dd
@@ -1,4 +1,7 @@
+#include <algorithm>
 #include <cassert>
+#include <cmath>
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
@@ -53,30 +56,8 @@ auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
 
 auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
                     const Matrix<data_type> &matB) {
-
-  // auto d_matA = static_cast<T *>(sycl::malloc_device(matA.bytes(), q));
-  // q.memcpy(d_matA, matA.mem.data(), matA.bytes());
-
-  // auto d_matB = static_cast<T *>(sycl::malloc_device(matB_t.bytes(), q));
-  // q.memcpy(d_matB, matB_t.mem.data(), matB_t.bytes());
-
   Matrix<data_type> matRes(matA.rows, matB.cols);
-
-  // auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
-
-  // std::size_t max_group_size =
-  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
-
-  // lets assume we always have a maximum group size with a power of 2
-  // const std::uint32_t local_one_dim =
-  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
 
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  // sycl::range<2> local_range(
-  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
-  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
-
-  q.wait();
-
   {
     sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@@ -115,24 +96,8 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
                              const Matrix<data_type> &matB) {
-
   Matrix<data_type> matB_t = matB.t();
 
   Matrix<data_type> matRes(matA.rows, matB.cols);
-
-  // auto d_matRes = static_cast<T *>(sycl::malloc_device(matRes.bytes(), q));
-
-  // std::size_t max_group_size =
-  //     q.get_device().get_info<sycl::info::device::max_work_group_size>();
-
-  // lets assume we always have a maximum group size with a power of 2
-  // const std::uint32_t local_one_dim =
-  //     std::pow(2, static_cast<std::uint32_t>(std::log2(max_group_size) / 2));
 
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  // sycl::range<2> local_range(
-  //     local_one_dim > matRes.rows ? matRes.rows : local_one_dim,
-  //     local_one_dim > matRes.cols ? matRes.cols : local_one_dim);
-
-  q.wait();
-
   {
     sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
@@ -167,6 +132,102 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
   return matRes.chksum();
 }
 
+/* Obtains the previous power of two from the given integer.
+ * It works by masking out all ones after the first one bit,
+ * then leaves the first one bit intact, effectively
+ * yielding the first power of two < x. */
+inline int prevPowerOfTwo(int x) {
+  if (x < 0) {
+    return 0;
+  }
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return x - (x >> 1);
+}
+
+template <class T>
+auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
+                         const Matrix<T> &matB) {
+  Matrix<T> matRes(matA.rows, matB.cols);
+
+  std::size_t max_group_size =
+      q.get_device().get_info<sycl::info::device::max_work_group_size>();
+
+  // let's assume the maximum work-group size is always a power of two
+  const std::uint32_t max_block_size =
+      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));
+
+  const std::uint32_t block_size = std::min(matA.cols, max_block_size);
+
+  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  sycl::range<2> local_range(block_size, block_size); // global_range must divide evenly by block_size
+
+  {
+    sycl::buffer<T, 2> b_matA(matA.mem.data(),
+                              sycl::range<2>(matA.rows, matA.cols));
+
+    sycl::buffer<T, 2> b_matB(matB.mem.data(),
+                              sycl::range<2>(matB.rows, matB.cols));
+
+    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
+                                sycl::range<2>(matRes.rows, matRes.cols));
+
+    q.submit([&](sycl::handler &h) {
+      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
+      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
+      auto acc_matRes =
+          b_matRes.template get_access<sycl::access::mode::write>(h);
+
+      sycl::accessor<T, 2, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          tileA(local_range, h);
+
+      sycl::accessor<T, 2, sycl::access::mode::read_write,
+                     sycl::access::target::local>
+          tileB(local_range, h);
+
+      h.parallel_for<class tiled_matmul>(
+          sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> ID) {
+            const int i = ID.get_global_id(0);
+            const int j = ID.get_global_id(1);
+
+            const int local_i = ID.get_local_id(0);
+            const int local_j = ID.get_local_id(1);
+
+            const int max_tile = ID.get_group_range(0); // tiles along k; assumes matA is square
+
+            // partial sum for this work-item's output element
+            T sum = 0;
+
+            for (int tile_i = 0; tile_i < max_tile; tile_i++) {
+              tileA[local_i][local_j] =
+                  acc_matA[i][tile_i * block_size + local_j];
+              tileB[local_j][local_i] =
+                  acc_matB[block_size * tile_i + local_i][j];
+
+              ID.barrier(sycl::access::fence_space::local_space);
+
+              for (std::uint32_t k = 0; k < block_size; k++) {
+                sum += tileA[local_i][k] * tileB[local_j][k];
+              }
+
+              ID.barrier(sycl::access::fence_space::local_space);
+            }
+
+            acc_matRes[i][j] = sum;
+          });
+    });
+  }
+
+  q.wait();
+
+  return matRes.chksum();
+}
+
 auto main(int argc, char **argv) -> int {
 
   if (argc != 3) {
@@ -180,6 +241,7 @@ auto main(int argc, char **argv) -> int {
 
   assert(matA.rows == matB.cols);
 
+#ifdef SEQ_BENCH
   auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
   print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
 
@@ -187,6 +249,7 @@ auto main(int argc, char **argv) -> int {
       measure<>::duration(matrixMultTransposeCPU, matA, matB);
   print_pair("CPU - transposed", cpu_transp_chksum.first,
              cpu_transp_chksum.second.count());
+#endif
 
   sycl::queue cpu_queue(sycl::cpu_selector_v);
 
@@ -208,5 +271,10 @@ auto main(int argc, char **argv) -> int {
   print_pair("GPU - transposed", gpu_transp_chksum.first,
              gpu_transp_chksum.second.count());
 
+  auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL<data_type>,
+                                              gpu_queue, matA, matB);
+  print_pair("GPU - tiled", gpu_tiled_chksum.first,
+             gpu_tiled_chksum.second.count());
+
   return EXIT_SUCCESS;
 }
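
Note: a quick sanity check of the prevPowerOfTwo helper introduced above. For x = 48 the decrement gives 47 (0b101111), the shift-or cascade smears the leading bit into 63 (0b111111), and x - (x >> 1) keeps only that leading bit, yielding 32. A minimal standalone check (plain C++; the helper is copied from the diff so the snippet compiles on its own, and the test values are illustrative):

#include <cassert>

// Same helper as in the commit above, repeated so the check is self-contained.
inline int prevPowerOfTwo(int x) {
  if (x < 0) {
    return 0;
  }
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x - (x >> 1);
}

int main() {
  assert(prevPowerOfTwo(48) == 32); // largest power of two strictly below 48
  assert(prevPowerOfTwo(32) == 16); // exact powers of two map to the next one below
  assert(prevPowerOfTwo(1) == 0);   // nothing strictly below 1
  return 0;
}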
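This helper drives the work-group sizing in matrixMultTiledSYCL: with a hypothetical max_work_group_size of 1024, std::sqrt gives 32 and prevPowerOfTwo(32) gives 16, so each tile is 16x16 = 256 work-items, safely under the device limit. A sketch of that host-side computation under assumed values (the 1024 limit and the 512-column matrix are made up for illustration):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>

// prevPowerOfTwo as in the commit (see the previous snippet for the full body).
inline int prevPowerOfTwo(int x) {
  if (x < 0) return 0;
  --x;
  x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
  return x - (x >> 1);
}

int main() {
  const std::size_t max_group_size = 1024; // assumed device limit, purely illustrative
  const std::uint32_t max_block_size =
      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));
  const std::uint32_t cols = 512; // stand-in for matA.cols
  const std::uint32_t block_size = std::min(cols, max_block_size);

  // Prints "16 256": a 16x16 tile stays well under the 1024-work-item limit.
  std::cout << block_size << " " << block_size * block_size << "\n";
  return 0;
}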
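For checking the kernel's tile indexing, here is a plain-C++ sketch of the same blocking scheme, written against flat row-major arrays rather than the repo's Matrix type; the signature and the square-matrix/divisibility assumptions are mine, mirroring the kernel's requirement that the global range divide evenly by the local range.

#include <cstddef>
#include <vector>

// Reference tiled multiply: res += a * b for n x n row-major matrices.
// Assumes n % block == 0 and that res is zero-initialized by the caller.
void matmulTiledRef(const std::vector<float> &a, const std::vector<float> &b,
                    std::vector<float> &res, std::size_t n, std::size_t block) {
  for (std::size_t bi = 0; bi < n; bi += block) {     // tile row of the result
    for (std::size_t bj = 0; bj < n; bj += block) {   // tile column of the result
      for (std::size_t bk = 0; bk < n; bk += block) { // tile index along k (the kernel's tile_i)
        // One (bi, bj, bk) iteration is the work a single SYCL work-group does
        // per pass of its tile loop: multiply the (bi, bk) tile of a with the
        // (bk, bj) tile of b and accumulate into the (bi, bj) tile of res.
        for (std::size_t i = bi; i < bi + block; ++i) {
          for (std::size_t j = bj; j < bj + block; ++j) {
            float sum = 0;
            for (std::size_t k = bk; k < bk + block; ++k) {
              sum += a[i * n + k] * b[k * n + j];
            }
            res[i * n + j] += sum;
          }
        }
      }
    }
  }
}

Comparing a checksum of res against the value returned by matrixMultTiledSYCL on the same input is a cheap way to validate the kernel's indexing.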