#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <hipSYCL/sycl/info/device.hpp>
#include <iostream>

#include <CL/sycl.hpp>
#include <utility>

#include "matrix.hpp"
#include "timer.hpp"

namespace sycl = cl::sycl;
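
// Build/run sketch (illustrative only - the file name, header layout and exact
// flags below are assumptions, not part of this project). hipSYCL ships the
// `syclcc` compiler driver, AdaptiveCpp the `acpp` driver, e.g.:
//
//   syclcc -O3 -o matmul matmul.cpp   # assuming this file is named matmul.cpp
//   ./matmul matA.txt matB.txt
//
// matrix.hpp and timer.hpp are assumed to be header-only includes here.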

#define stream_hex(_val) std::hex << _val << std::dec

#define print_pair(_desc, _chksm, _time)                                       \
  std::cout << _desc << "\n\t"                                                 \
            << "\t-> Check: 0x" << stream_hex(_chksm)                          \
            << "\tRuntime: " << _time << " us\n\n"

using data_type = int;

/**
 * Straightforward matrix multiplication on a single CPU core using three
 * nested loops.
 *
 * @param matA Matrix A
 * @param matB Matrix B
 *
 * @return returns the checksum of the result produced by multiplication of
 * matrix A and B
 */
template <class T>
auto matrixMultCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
  Matrix<T> res(matA.rows, matB.cols);
  for (std::uint32_t i = 0; i < res.rows; i++) {
    for (std::uint32_t j = 0; j < res.cols; j++) {
      auto &res_val = res(i, j) = 0;
      for (std::uint32_t k = 0; k < matA.cols; k++) {
        res_val += matA(i, k) * matB(k, j);
      }
    }
  }

  return res.chksum();
}

/**
 * Also a straightforward solution, but matrix B is transposed before diving
 * into the three nested loops. The inner loop then reads the second factor
 * row-wise as well, which makes much better use of the CPU cache.
 *
 * @param matA Matrix A
 * @param matB Matrix B
 *
 * @return returns the checksum of the result produced by multiplication of
 * matrix A and B
 */
template <class T>
auto matrixMultTransposeCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
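  // NOTE: this assumes Matrix stores its elements row-major; after the
  // transpose, the inner loop below walks both matA and matB_t contiguously
  // in memory.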
  Matrix<T> matB_t = matB.t();
  Matrix<T> res(matA.rows, matB.cols);
  for (std::uint32_t i = 0; i < res.rows; i++) {
    for (std::uint32_t j = 0; j < res.cols; j++) {
      auto &res_val = res(i, j) = 0;
      for (std::uint32_t k = 0; k < matA.cols; k++) {
        res_val += matA(i, k) * matB_t(j, k);
      }
    }
  }

  return res.chksum();
}

/**
 * The naive implementation of the matrix multiplication using SYCL. The actual
 * parallelization (OpenMP on the CPU, GPU offloading) is done by the compiler.
 *
 * @param q Passing the desired SYCL queue. If you want to utilize the GPU,
 * initialize the queue with a gpu selector.
 * @param matA Matrix A
 * @param matB Matrix B
 *
 * @return returns the checksum of the result produced by multiplication of
 * matrix A and B
 */
template <class T>
auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
                    const Matrix<T> &matB) {

  // allocate memory for the result matrix on the host
  Matrix<T> matRes(matA.rows, matB.cols);

  // define the size of our problem - here we want each task to calculate one
  // cell of the product, leading to a problem size of N x M
  sycl::range<2> global_range(matRes.rows, matRes.cols);

  {
    // defining 2-dimensional buffers which can then be exposed to the device.
    // It is also possible to use 1D buffers here, but then we would have to
    // manually calculate the index used to access the matrices for each thread
    // in the kernel code. Solving it this way asks the compiler to do the work.
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));

    sycl::buffer<T, 2> b_matB(matB.mem.data(),
                              sycl::range<2>(matB.rows, matB.cols));

    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));

    // submit work to the device. This is done by using a lambda function which
    // references all values known to the scope, i.e. the previously defined
    // buffers.
    q.submit([&](sycl::handler &h) {
      // create accessors and expose/copy the data to the device or mark them
      // to be written back after successful execution of the kernel. Access
      // can be controlled by access modes.
      //
      // Here, we only want matrices A and B to be read from and matrix C to be
      // written to.
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);

      // For the parallelized loop another lambda function is used, but all
      // known values are captured by value, as host and device don't share the
      // same address space.
      //
      // In addition to the lambda function, the parallel_for function is given
      // the global range we defined earlier to provide the size of the problem
      // and launch the corresponding number of tasks. The identifier of the
      // task is then passed to the lambda function as a parameter.
      h.parallel_for(global_range, [=](sycl::id<2> ID) {
        const auto i = ID[0];
        const auto j = ID[1];
        T sum = 0;

        for (auto k = 0; k < matA.cols; k++) {
          sum += acc_matA[i][k] * acc_matB[k][j];
        }
        acc_matRes[i][j] = sum;
      });
    });
  }
  // As the buffers are destroyed at the end of the scope above, which ensures
  // that the data is copied back from the device, there is no need to call a
  // barrier here.
  //
  // We can simply return the checksum of matrix C.

  return matRes.chksum();
}

/**
 * The implementation of the matrix multiplication using SYCL, transposing
 * matrix B beforehand. The actual parallelization (OpenMP on the CPU, GPU
 * offloading) is done by the compiler.
 *
 * For inline comments please refer to matrixMultSYCL(). Nothing changes except
 * that the buffer for matrix B is created from the transposed matrix.
 *
 * @param q Passing the desired SYCL queue. If you want to utilize the GPU,
 * initialize the queue with a gpu selector.
 * @param matA Matrix A
 * @param matB Matrix B
 *
 * @return returns the checksum of the result produced by multiplication of
 * matrix A and B
 */
template <class T>
auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
                             const Matrix<T> &matB) {

  Matrix<T> matB_t = matB.t();
  Matrix<T> matRes(matA.rows, matB.cols);
  sycl::range<2> global_range(matRes.rows, matRes.cols);

  {
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));

    sycl::buffer<T, 2> b_matB(matB_t.mem.data(),
                              sycl::range<2>(matB_t.rows, matB_t.cols));

    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));

    q.submit([&](sycl::handler &h) {
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);

      h.parallel_for(global_range, [=](sycl::id<2> ID) {
        auto i = ID[0];
        auto j = ID[1];
        T sum = 0;

        for (auto k = 0; k < matA.cols; k++) {
          sum += acc_matA[i][k] * acc_matB[j][k];
        }
        acc_matRes[i][j] = sum;
      });
    });
  }
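
  // Note: the explicit wait below is not strictly required - destroying the
  // buffers at the end of the scope above already blocks until the results
  // have been copied back into matRes. It is kept purely for clarity.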

  q.wait();

  return matRes.chksum();
}

/*
 * Obtained from
 * https://github.com/codeplaysoftware/computecpp-sdk/blob/master/samples/matrix-multiply.cpp
 *
 * Obtains the previous power of two from the given integer.
 * It works by masking out all ones after the first one bit,
 * then leaves the first one bit intact, effectively
 * yielding the first power of two < x. */
inline int prevPowerOfTwo(int x) {
  if (x < 0) {
    return 0;
  }
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x - (x >> 1);
}
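
// Worked example of prevPowerOfTwo: for x = 20 (0b10100), decrementing and
// smearing the bits gives 0b11111 = 31, and 31 - 15 = 16, the largest power of
// two below 20. For an exact power of two such as x = 16 the result is 8.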

/**
 * The implementation of the matrix multiplication using SYCL with tiling.
 *
 * Each task fills one field of a tile in the local memory of the device with a
 * value from global memory. For this, tasks are grouped into work groups. The
 * size of a work group is bounded by the maximum value reported by the device.
 * All tasks of one work group share the same local memory.
 *
 * As the tile is usually smaller than the actual matrices A and B, the tile is
 * 'moved' along the x-axis and the y-axis of those matrices respectively.
 *
 * By using tiling, each task only performs on the order of N/TILE_SIZE global
 * memory loads per input matrix and otherwise works on the much faster local
 * memory, speeding up the multiplication on most devices.
 *
 * Keep in mind that this function is intended to be executed on GPUs only, as
 * the nd_range paradigm with parallel_for is not efficiently implementable on
 * CPUs. See: <a
 * href="https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/compilation.md#compiler-support-to-accelerate-nd_range-parallel_for-on-cpus-ompaccelerated">Compilation
 * model of AdaptiveCpp</a>
 *
 * @param q Passing the desired SYCL queue. If you want to utilize the GPU,
 * initialize the queue with a gpu selector.
 * @param matA Matrix A
 * @param matB Matrix B
 *
 * @return returns the checksum of the result produced by multiplication of
 * matrix A and B
 */
template <class T>
auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
                         const Matrix<T> &matB) {
  Matrix<T> matRes(matA.rows, matB.cols);

  // obtain the maximum work group size for the given device
  std::size_t max_group_size =
      q.get_device().get_info<sycl::info::device::max_work_group_size>();

  // each tile must not exceed the max_group_size
  // -> tile_size * tile_size <= max_group_size
  const std::uint32_t max_tile_size =
      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_group_size)));

  // the problem size of the multiplication might be smaller than the maximum
  // tile size
  const std::uint32_t block_size = std::min(matA.cols, max_tile_size);
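
  // Back-of-the-envelope for the doc comment above: the naive kernel loads
  // 2 * matA.cols values from global memory per output element (one row of A
  // and one column of B). With tiling, each task instead performs roughly
  // 2 * (matA.cols / block_size) global loads - one element of A and one of B
  // per tile step - and does the remaining block_size multiply-adds per step
  // out of local memory.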

  // define the global range ...
  sycl::range<2> global_range(matRes.rows, matRes.cols);

  // ... and the local_range (one tile per work group)
  sycl::range<2> tile_range(block_size, block_size);

  {
    // allocate the buffers
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));

    sycl::buffer<T, 2> b_matB(matB.mem.data(),
                              sycl::range<2>(matB.rows, matB.cols));

    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));

    q.submit([&](sycl::handler &h) {
      // provide access to the buffers and ...
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);

      // ... allocate memory in the local device memory which should be
      // accessible to every thread of the work group, once for matrix A ...
      sycl::accessor<T, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          tileA(tile_range, h);

      // ... and once for matrix B
      sycl::accessor<T, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          tileB(tile_range, h);
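
      // (In SYCL 2020 these local tiles would typically be declared as
      //  sycl::local_accessor<T, 2> tileA{tile_range, h}; the older accessor
      //  form above is kept to match the rest of this file.)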

      // We define a kernel function by passing the global_range and the
      // tile_range to the parallel_for function of the handler. Secondly,
      // another lambda is passed as the kernel function, again with by-value
      // lambda captures. It receives an nd_item as parameter, which can be
      // used to extract all relevant data linked to the running task.
      h.parallel_for<class tiled_matmul>(
          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> ID) {
            // extract all relevant information
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);

            const int local_i = ID.get_local_id(0);
            const int local_j = ID.get_local_id(1);

            const int max_tile = ID.get_group_range(0);

            T sum = 0;

            // 'move' the tile over the A and B matrices
            for (int tile_i = 0; tile_i < max_tile; tile_i++) {
              // each thread copies 'its' value from global memory to local
              // memory.
              tileA[local_i][local_j] =
                  acc_matA[i][tile_i * block_size + local_j];
              // here we also transpose the B matrix
              tileB[local_j][local_i] =
                  acc_matB[block_size * tile_i + local_i][j];

              // we need an explicit barrier to ensure all threads of the work
              // group have written their value into the local memory
              ID.barrier(sycl::access::fence_space::local_space);

              // build the 'local' sum over the parts of matrices A and B
              // stored in local memory
              for (auto k = 0; k < block_size; k++) {
                sum += tileA[local_i][k] * tileB[local_j][k];
              }

              // ensure all threads have finished the multiplication before the
              // local memory is overwritten in the next iteration
              ID.barrier(sycl::access::fence_space::local_space);
            }

            // each thread stores its sum in the result matrix C
            acc_matRes[i][j] = sum;
          });
    });
  }

  return matRes.chksum();
}

auto main(int argc, char **argv) -> int {

  if (argc != 3) {
    std::cerr << "Provide 2 arguments to the program!\n"
              << "Usage: <prog> <matA>.txt <matB>.txt\n";
    return EXIT_FAILURE;
  }

  Matrix<data_type> matA(argv[1]);
  Matrix<data_type> matB(argv[2]);

  assert(matA.cols == matB.rows);
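  // Multiplying an (m x n) A with an (n x p) B requires A's column count to
  // equal B's row count; the result is (m x p). Keep in mind that assert() is
  // compiled out when NDEBUG is defined.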

#ifdef SEQ_BENCH
  auto cpu_chksum = measure<>::duration(matrixMultCPU<data_type>, matA, matB);
  print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());

  auto cpu_transp_chksum =
      measure<>::duration(matrixMultTransposeCPU<data_type>, matA, matB);
  print_pair("CPU - transposed", cpu_transp_chksum.first,
             cpu_transp_chksum.second.count());
#endif

  sycl::queue cpu_queue(sycl::cpu_selector_v);

  std::cout << "Starting CPU benchmarks with device: "
            << cpu_queue.get_device().get_info<sycl::info::device::name>()
            << "\n";

  auto omp_chksum =
      measure<>::duration(matrixMultSYCL<data_type>, cpu_queue, matA, matB);
  print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count());

  auto omp_transp_chksum = measure<>::duration(
      matrixMultTransposeSYCL<data_type>, cpu_queue, matA, matB);
  print_pair("OMP - transposed", omp_transp_chksum.first,
             omp_transp_chksum.second.count());

  sycl::queue gpu_queue(sycl::gpu_selector_v);
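
  // Note: a strictly conforming SYCL 2020 implementation may throw here when
  // no GPU is present; the is_gpu() check below assumes that the
  // implementation falls back to some other device instead.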

  if (!gpu_queue.get_device().is_gpu()) {
    std::cout << "No GPU found, skipping GPU benchmarks\n";
    return EXIT_SUCCESS;
  }

  std::cout << "Starting GPU benchmarks with device: "
            << gpu_queue.get_device().get_info<sycl::info::device::name>()
            << "\n";

  auto gpu_chksum =
      measure<>::duration(matrixMultSYCL<data_type>, gpu_queue, matA, matB);
  print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count());

  auto gpu_transp_chksum = measure<>::duration(
      matrixMultTransposeSYCL<data_type>, gpu_queue, matA, matB);
  print_pair("GPU - transposed", gpu_transp_chksum.first,
             gpu_transp_chksum.second.count());

  auto gpu_tiled_chksum = measure<>::duration(matrixMultTiledSYCL<data_type>,
                                              gpu_queue, matA, matB);
  print_pair("GPU - tiled", gpu_tiled_chksum.first,
             gpu_tiled_chksum.second.count());

  return EXIT_SUCCESS;
}