refactor code to compile with Intel oneAPI
parent 3fcd37fb1f
commit 9633302226
@@ -97,19 +97,19 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
   // cell of the product. Thus, leading to a problem size of N x M
   sycl::range<2> global_range(matRes.rows, matRes.cols);
 
+  const auto &inner_loop = matA.cols;
+
   {
     // defining 2 dimensional buffers which can then be exposed to the device.
     // It is also possible to use 1D buffers here, but then we have to manually
     // calculate the index to access the matrices for each thread in the kernel
     // code. Solving it this way will ask the compiler to do the work.
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB.mem.data(),
-                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     // submit work to the device. this is done by using a lambda function which
     // references all values known to the scope i.e. the previously defined
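The buffer changes in this hunk lean on SYCL 2020 class template argument deduction: the element type and dimensionality are deduced from the pointer and the range, so `sycl::buffer<T, 2>` and `sycl::range<2>` no longer have to be spelled out. A minimal sketch of what the deduction resolves to (the float data here is illustrative, not from the repo):

    #include <sycl/sycl.hpp>
    #include <vector>

    std::vector<float> data(4 * 8);
    // element type and dimensionality are deduced from the arguments:
    sycl::buffer buf(data.data(), sycl::range(4, 8));
    // deduced type is sycl::buffer<float, 2>, identical to the old spelling:
    // sycl::buffer<float, 2> buf(data.data(), sycl::range<2>(4, 8));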
@@ -121,10 +121,9 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
       //
       // Here, we only need the matrices A and B to be read from and the
       // matrix C to be written to.
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
       // For the parallelized loop another lambda function is used, but all
       // known values are passed by value, as host and device don't share the
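The older SYCL 1.2.1-style `get_access<mode>(handler)` spelling is traded here for the SYCL 2020 accessor constructor, where a tag object (`sycl::read_only`, `sycl::write_only`, `sycl::read_write`) selects the access mode and deduction fills in the rest. Taking the first accessor above as the example, the two forms are functionally equivalent inside the command group:

    // SYCL 1.2.1-style spelling:
    auto a_old = b_matA.template get_access<sycl::access::mode::read>(h);
    // SYCL 2020 idiom; deduces sycl::accessor<T, 2, sycl::access_mode::read>:
    sycl::accessor a_new(b_matA, h, sycl::read_only);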
@@ -134,12 +133,12 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
       // the global range we defined earlier to provide the size of the problem
       // and launch the count of tasks accordingly. The identifier of the task
       // is then passed to the lambda function as a parameter.
-      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+      h.parallel_for(global_range, [=](auto ID) {
         const auto i = ID[0];
         const auto j = ID[1];
         T sum = 0;
 
-        for (auto k = 0; k < matA.cols; k++) {
+        for (auto k = 0; k < inner_loop; k++) {
           sum += acc_matA[i][k] * acc_matB[k][j];
         }
         acc_matRes[i][j] = sum;
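The new `inner_loop` alias is the actual compile fix in this hunk: with `[=]`, using `matA.cols` inside the kernel would capture the whole `Matrix<T>` by value, and DPC++ requires every capture to be device-copyable, which a type holding host storage (presumably a std::vector behind `matA.mem`) is not. Capturing only the extent sidesteps that. A standalone sketch of the pattern, with `Matrix2` as a hypothetical stand-in for the repo's `Matrix<T>`:

    #include <sycl/sycl.hpp>
    #include <vector>

    struct Matrix2 {               // hypothetical stand-in for Matrix<T>
      std::vector<int> mem;        // host storage: not device-copyable
      size_t rows, cols;
    };

    void kernel_sketch(sycl::queue &q, const Matrix2 &m) {
      const auto &cols = m.cols;   // capture only the trivially copyable extent
      q.submit([&](sycl::handler &h) {
        // writing `m.cols` in the body would capture `m` itself and fail to
        // compile; `cols` is copied in as a plain integer instead
        h.parallel_for(sycl::range(m.rows), [=](auto id) {
          size_t n = cols;
          (void)n;
        });
      });
    }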
@@ -176,30 +175,30 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
 
   Matrix<T> matB_t = matB.t();
   Matrix<T> matRes(matA.rows, matB.cols);
-  sycl::range<2> global_range(matRes.rows, matRes.cols);
+  sycl::range global_range(matRes.rows, matRes.cols);
+
+  const auto &inner_loop = matA.cols;
 
   {
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB_t.mem.data(),
-                              sycl::range<2>(matB_t.rows, matB_t.cols));
+    sycl::buffer b_matB(matB_t.mem.data(),
+                        sycl::range(matB_t.rows, matB_t.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
-      h.parallel_for(global_range, [=](sycl::id<2> ID) {
+      h.parallel_for(global_range, [=](auto ID) {
         auto i = ID[0];
         auto j = ID[1];
         T sum = 0;
 
-        for (auto k = 0; k < matA.cols; k++) {
+        for (auto k = 0; k < inner_loop; k++) {
           sum += acc_matA[i][k] * acc_matB[j][k];
         }
         acc_matRes[i][j] = sum;
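In this transposed variant the inner product reads `acc_matB[j][k]`, so both operands are walked row-wise and therefore contiguously, which is the point of multiplying against `matB.t()`. A hedged usage sketch, assuming `Matrix<T>(rows, cols)` allocates storage and both functions return the result matrix as their signatures suggest:

    sycl::queue q;                               // default-selected device
    Matrix<float> A(256, 512), B(512, 128);
    auto C1 = matrixMultSYCL(q, A, B);
    auto C2 = matrixMultTransposeSYCL(q, A, B);  // same product, row-wise B reads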
@@ -287,32 +286,25 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
 
   {
     // allocate the buffers
-    sycl::buffer<T, 2> b_matA(matA.mem.data(),
-                              sycl::range<2>(matA.rows, matA.cols));
+    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));
 
-    sycl::buffer<T, 2> b_matB(matB.mem.data(),
-                              sycl::range<2>(matB.rows, matB.cols));
+    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));
 
-    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
-                                sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
       // provide access to the buffers and ...
-      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
-      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
-      auto acc_matRes =
-          b_matRes.template get_access<sycl::access::mode::write>(h);
+      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
+      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
+      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);
 
       // ... allocate memory in the local device memory which should be
       // accessible to each thread, per matrix A ...
-      sycl::accessor<int, 2, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          tileA(tile_range, h);
+      sycl::local_accessor<T, 2> tileA(tile_range, h);
 
       // ... and matrix B
-      sycl::accessor<int, 2, sycl::access::mode::read_write,
-                     sycl::access::target::local>
-          tileB(tile_range, h);
+      sycl::local_accessor<T, 2> tileB(tile_range, h);
 
       // We define a kernel function by passing the global_range and the
       // tile_range to the parallel_for function of the handler. Secondly,
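The tile buffers show two fixes at once: SYCL 2020 deprecates local-target accessors in favor of `sycl::local_accessor`, and the old declarations hard-coded `int` as the element type even though the matrices hold `T`. Side by side, inside the command group:

    // deprecated local-target accessor (and wrong element type here):
    // sycl::accessor<int, 2, sycl::access::mode::read_write,
    //                sycl::access::target::local> tileA(tile_range, h);
    // SYCL 2020 replacement, element type now matching the matrices:
    sycl::local_accessor<T, 2> tileA(tile_range, h);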
@@ -320,7 +312,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
       // passed-by-value lambda captures. The parameter is an nd_item, which
       // can be used to extract all relevant data linked to the running task.
       h.parallel_for<class tiled_matmul>(
-          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) {
+          sycl::nd_range{global_range, tile_range}, [=](auto ID) {
            // extract all relevant information
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);
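Dropping the reference from the kernel parameter is the other oneAPI compile fix: a SYCL kernel receives its nd_item by value, and the non-const `sycl::nd_item<2> &` parameter is evidently what DPC++ rejected here. `auto` lets the nd_range overload deduce the by-value nd_item; written out explicitly, the accepted form would be:

    h.parallel_for<class tiled_matmul>(
        sycl::nd_range{global_range, tile_range},
        [=](sycl::nd_item<2> ID) {   // by value, not by reference
          const int i = ID.get_global_id(0);
          const int j = ID.get_global_id(1);
        });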