diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp
index 15b7e00..e32bcd8 100644
--- a/src/sycl_comp.cpp
+++ b/src/sycl_comp.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -23,6 +24,16 @@ namespace sycl = cl::sycl;
 
 using data_type = int;
 
+/**
+ * Straightforward matrix multiplication using a single CPU core and three
+ * nested loops.
+ *
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultCPU(const Matrix &matA, const Matrix &matB) {
   Matrix res(matA.rows, matB.cols);
@@ -38,9 +49,19 @@ auto matrixMultCPU(const Matrix &matA, const Matrix &matB) {
   return res.chksum();
 }
 
+/**
+ * Also a straightforward solution, but matrix B is transposed before diving
+ * into the three nested loops. This allows faster access to the second
+ * factor, as the CPU cache is utilized better.
+ *
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
-auto matrixMultTransposeCPU(const Matrix &matA,
-                            const Matrix &matB) {
+auto matrixMultTransposeCPU(const Matrix &matA, const Matrix &matB) {
   Matrix matB_t = matB.t();
   Matrix res(matA.rows, matB.cols);
   for (std::uint32_t i = 0; i < res.rows; i++) {
@@ -55,30 +76,70 @@ auto matrixMultTransposeCPU(const Matrix &matA,
   return res.chksum();
 }
 
+/**
+ * The naive implementation of the matrix multiplication using SYCL. The
+ * actual implementation (OpenMP CPU parallelization, GPU offloading) is done
+ * by the compiler.
+ *
+ * @param q The desired SYCL queue. If you want to utilize the GPU,
+ *          initialize the queue with a GPU selector.
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultSYCL(sycl::queue &q, const Matrix &matA,
                     const Matrix &matB) {
+
+  // allocate memory for the result matrix on the host
   Matrix matRes(matA.rows, matB.cols);
+
+  // define the size of our problem - here we want each task to calculate one
+  // cell of the product, leading to a problem size of N x M
  sycl::range<2> global_range(matRes.rows, matRes.cols);
 
   {
+    // Define 2-dimensional buffers which can then be exposed to the device.
+    // It is also possible to use 1D buffers here, but then we would have to
+    // manually calculate the index into the matrices for each thread in the
+    // kernel code. Solving it this way asks the compiler to do that work.
     sycl::buffer b_matA(matA.mem.data(),
-                        sycl::range<2>(matA.rows, matA.cols));
+                        sycl::range<2>(matA.rows, matA.cols));
 
     sycl::buffer b_matB(matB.mem.data(),
-                        sycl::range<2>(matB.rows, matB.cols));
+                        sycl::range<2>(matB.rows, matB.cols));
 
-    sycl::buffer b_matRes(
-        matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range<2>(matRes.rows, matRes.cols));
 
+    // Submit work to the device. This is done using a lambda function which
+    // references all values known in the enclosing scope, i.e. the previously
+    // defined buffers.
     q.submit([&](sycl::handler &h) {
+      // Create accessors to expose/copy the data to the device or mark it to
+      // be written back after successful execution of the kernel. Access can
+      // be controlled via access modes.
+      //
+      // Here, we only want matrices A and B to be read from and matrix C to
+      // be written to.
      auto acc_matA = b_matA.template get_access(h);
      auto acc_matB = b_matB.template get_access(h);
-      auto acc_matRes = b_matRes.template get_access(h);
+      auto acc_matRes =
+          b_matRes.template get_access(h);
 
+      // For the parallelized loop another lambda function is used, but all
+      // known values are passed by value, as host and device do not share the
+      // same address space.
+      //
+      // In addition to the lambda function, the parallel_for function is given
+      // the global range we defined earlier to provide the size of the problem
+      // and launch the corresponding number of tasks. The identifier of the
+      // task is then passed to the lambda function as a parameter.
      h.parallel_for(global_range, [=](sycl::id<2> ID) {
-        auto i = ID[0];
-        auto j = ID[1];
+        const auto i = ID[0];
+        const auto j = ID[1];
 
         T sum = 0;
         for (auto k = 0; k < matA.cols; k++) {
@@ -88,12 +149,30 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix &matA,
     });
   }
-
-  q.wait();
 
+  // The buffers are destroyed at the end of the scope, which ensures the data
+  // is copied back from the device, so there is no need for a barrier here.
+  //
+  // We can simply return the checksum of matrix C.
   return matRes.chksum();
 }
 
+/**
+ * The implementation of the matrix multiplication using SYCL, with matrix B
+ * transposed beforehand. The actual implementation (OpenMP CPU
+ * parallelization, GPU offloading) is done by the compiler.
+ *
+ * For inline comments please refer to matrixMultSYCL(). Nothing changes except
+ * that a buffer for the transposed matrix B is created.
+ *
+ * @param q The desired SYCL queue. If you want to utilize the GPU,
+ *          initialize the queue with a GPU selector.
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA,
                              const Matrix &matB) {
@@ -104,18 +183,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA,
 
   {
     sycl::buffer b_matA(matA.mem.data(),
-                        sycl::range<2>(matA.rows, matA.cols));
+                        sycl::range<2>(matA.rows, matA.cols));
 
     sycl::buffer b_matB(matB_t.mem.data(),
-                        sycl::range<2>(matB_t.rows, matB_t.cols));
+                        sycl::range<2>(matB_t.rows, matB_t.cols));
 
-    sycl::buffer b_matRes(
-        matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range<2>(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
       auto acc_matA = b_matA.template get_access(h);
       auto acc_matB = b_matB.template get_access(h);
-      auto acc_matRes = b_matRes.template get_access(h);
+      auto acc_matRes =
+          b_matRes.template get_access(h);
 
       h.parallel_for(global_range, [=](sycl::id<2> ID) {
         auto i = ID[0];
@@ -156,24 +236,60 @@ inline int prevPowerOfTwo(int x) {
   return x - (x >> 1);
 }
 
+/**
+ * The implementation of the matrix multiplication using SYCL with tiling.
+ *
+ * Each task fills a field in the local memory of the device with values from
+ * the global memory. For this, tasks are grouped into work groups. The size
+ * of a work group is given by the maximum value reported in the device
+ * description. All tasks of one work group share the same local memory.
+ *
+ * As the tile might be smaller than the actual sizes of matrices A and B, the
+ * tile is 'moved' along the x-axis and y-axis of those matrices, respectively.
+ *
+ * By using tiling, each task only performs N/TILE_SIZE global memory
+ * operations and otherwise uses the much faster local memory, speeding up the
+ * multiplication on most devices.
+ *
+ * Keep in mind that this function is intended to be executed on GPUs only, as
+ * the nd_range paradigm with parallel_for cannot be implemented efficiently on
+ * CPUs. See: Compilation model of AdaptiveCpp
+ *
+ * @param q The desired SYCL queue. If you want to utilize the GPU,
+ *          initialize the queue with a GPU selector.
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA,
                          const Matrix &matB) {
   Matrix matRes(matA.rows, matB.cols);
 
+  // obtain the maximum work group size for the given device
   std::size_t max_group_size = q.get_device().get_info();
 
-  // lets assume we always have a maximum group size with a power of 2
-  const std::uint32_t max_block_size =
+  // each tile should not exceed the max_group_size -> T x T <= max_group_size
+  const std::uint32_t max_tile_size =
      static_cast(prevPowerOfTwo(std::sqrt(max_group_size)));
 
-  const std::uint32_t block_size = std::min(matA.cols, max_block_size);
+  // the problem size of the multiplication might be smaller than the maximum
+  // tile size
+  const std::uint32_t block_size = std::min(matA.cols, max_tile_size);
 
+  // define the global range ...
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  sycl::range<2> local_range(block_size, block_size);
+
+  // ... and the local range (one tile per work group)
+  sycl::range<2> tile_range(block_size, block_size);
 
   {
+    // allocate the buffers
     sycl::buffer b_matA(matA.mem.data(),
                         sycl::range<2>(matA.rows, matA.cols));
 
@@ -184,21 +300,31 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA,
                           sycl::range<2>(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
+      // provide access to the buffers and ...
       auto acc_matA = b_matA.template get_access(h);
       auto acc_matB = b_matB.template get_access(h);
       auto acc_matRes = b_matRes.template get_access(h);
 
+      // ... allocate memory in the local device memory, which should be
+      // accessible to each thread, for matrix A ...
       sycl::accessor
-          tileA(local_range, h);
+          tileA(tile_range, h);
 
+      // ... and matrix B
       sycl::accessor
-          tileB(local_range, h);
+          tileB(tile_range, h);
 
+      // We define a kernel function by passing the global_range and the
+      // tile_range to the parallel_for function of the handler. Secondly,
+      // another lambda is passed as the kernel function, again with
+      // lambda captures passed by value. Its parameter is an nd_item, which
+      // can be used to extract all relevant data linked to the running task.
       h.parallel_for(
-          sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> &ID) {
+          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) {
+            // extract all relevant information
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);
 
@@ -207,31 +333,40 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA,
            const int max_tile = ID.get_group_range(0);
 
-            // Current local IDem
            T sum = 0;
+            // 'move' the tile over the matrices A and B
            for (int tile_i = 0; tile_i < max_tile; tile_i++) {
+              // each thread copies 'its' value from the global memory to the
+              // local memory
              tileA[local_i][local_j] =
                  acc_matA[i][tile_i * block_size + local_j];
+              // here we also transpose the B matrix
              tileB[local_j][local_i] =
                  acc_matB[block_size * tile_i + local_i][j];
 
+              // we need an explicit barrier to ensure that all threads of the
+              // work group have finished writing their value to the local
+              // memory
              ID.barrier(sycl::access::fence_space::local_space);
 
+              // build the 'local' sum over the parts of matrix A and B stored
+              // in the local memory
              for (auto k = 0; k < block_size; k++) {
                sum += tileA[local_i][k] * tileB[local_j][k];
              }
 
+              // ensure all threads have finished the multiplication before
+              // the local memory is overwritten
              ID.barrier(sycl::access::fence_space::local_space);
            }
 
+            // each thread stores its sum in the result matrix C
            acc_matRes[i][j] = sum;
          });
    });
  }
 
-  q.wait();
-
  return matRes.chksum();
}
 
@@ -260,21 +395,36 @@ auto main(int argc, char **argv) -> int {
 
  sycl::queue cpu_queue(sycl::cpu_selector_v);
 
-  auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB);
+  std::cout << "Starting CPU benchmarks with device: "
+            << cpu_queue.get_device().get_info()
+            << "\n";
+
+  auto omp_chksum =
+      measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB);
  print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count());
 
-  auto omp_transp_chksum =
-      measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB);
+  auto omp_transp_chksum = measure<>::duration(
+      matrixMultTransposeSYCL, cpu_queue, matA, matB);
  print_pair("OMP - transposed", omp_transp_chksum.first,
             omp_transp_chksum.second.count());
 
  sycl::queue gpu_queue(sycl::gpu_selector_v);
 
-  auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB);
+  if (!gpu_queue.get_device().is_gpu()) {
+    std::cout << "No GPU found, skipping GPU benchmarks\n";
+    return EXIT_SUCCESS;
+  }
+
+  std::cout << "Starting GPU benchmarks with device: "
+            << gpu_queue.get_device().get_info()
+            << "\n";
+
+  auto gpu_chksum =
+      measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB);
  print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count());
 
-  auto gpu_transp_chksum =
-      measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB);
+  auto gpu_transp_chksum = measure<>::duration(
+      matrixMultTransposeSYCL, gpu_queue, matA, matB);
  print_pair("GPU - transposed", gpu_transp_chksum.first,
             gpu_transp_chksum.second.count());