reintroduce template functions
This commit is contained in:
parent
c9c0daa098
commit
45e621f2b3
@ -23,9 +23,9 @@ namespace sycl = cl::sycl;
|
|||||||
|
|
||||||
using data_type = int;
|
using data_type = int;
|
||||||
|
|
||||||
auto matrixMultCPU(const Matrix<data_type> &matA,
|
template <class T>
|
||||||
const Matrix<data_type> &matB) {
|
auto matrixMultCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
|
||||||
Matrix<data_type> res(matA.rows, matB.cols);
|
Matrix<T> res(matA.rows, matB.cols);
|
||||||
for (std::uint32_t i = 0; i < res.rows; i++) {
|
for (std::uint32_t i = 0; i < res.rows; i++) {
|
||||||
for (std::uint32_t j = 0; j < res.cols; j++) {
|
for (std::uint32_t j = 0; j < res.cols; j++) {
|
||||||
auto &res_val = res(i, j) = 0;
|
auto &res_val = res(i, j) = 0;
|
||||||
@ -38,10 +38,11 @@ auto matrixMultCPU(const Matrix<data_type> &matA,
|
|||||||
return res.chksum();
|
return res.chksum();
|
||||||
}
|
}
|
||||||
|
|
||||||
auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
|
template <class T>
|
||||||
const Matrix<data_type> &matB) {
|
auto matrixMultTransposeCPU(const Matrix<T> &matA,
|
||||||
Matrix<data_type> matB_t = matB.t();
|
const Matrix<T> &matB) {
|
||||||
Matrix<data_type> res(matA.rows, matB.cols);
|
Matrix<T> matB_t = matB.t();
|
||||||
|
Matrix<T> res(matA.rows, matB.cols);
|
||||||
for (std::uint32_t i = 0; i < res.rows; i++) {
|
for (std::uint32_t i = 0; i < res.rows; i++) {
|
||||||
for (std::uint32_t j = 0; j < res.cols; j++) {
|
for (std::uint32_t j = 0; j < res.cols; j++) {
|
||||||
auto &res_val = res(i, j) = 0;
|
auto &res_val = res(i, j) = 0;
|
||||||
@ -54,30 +55,31 @@ auto matrixMultTransposeCPU(const Matrix<data_type> &matA,
|
|||||||
return res.chksum();
|
return res.chksum();
|
||||||
}
|
}
|
||||||
|
|
||||||
auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
|
template <class T>
|
||||||
const Matrix<data_type> &matB) {
|
auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
|
||||||
Matrix<data_type> matRes(matA.rows, matB.cols);
|
const Matrix<T> &matB) {
|
||||||
|
Matrix<T> matRes(matA.rows, matB.cols);
|
||||||
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
||||||
|
|
||||||
{
|
{
|
||||||
sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
|
sycl::buffer<T, 2> b_matA(matA.mem.data(),
|
||||||
sycl::range<2>(matA.rows, matA.cols));
|
sycl::range<2>(matA.rows, matA.cols));
|
||||||
|
|
||||||
sycl::buffer<data_type, 2> b_matB(matB.mem.data(),
|
sycl::buffer<T, 2> b_matB(matB.mem.data(),
|
||||||
sycl::range<2>(matB.rows, matB.cols));
|
sycl::range<2>(matB.rows, matB.cols));
|
||||||
|
|
||||||
sycl::buffer<data_type, 2> b_matRes(
|
sycl::buffer<T, 2> b_matRes(
|
||||||
matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
|
matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
|
||||||
|
|
||||||
q.submit([&](sycl::handler &h) {
|
q.submit([&](sycl::handler &h) {
|
||||||
auto acc_matA = b_matA.get_access<sycl::access::mode::read>(h);
|
auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
|
||||||
auto acc_matB = b_matB.get_access<sycl::access::mode::read>(h);
|
auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
|
||||||
auto acc_matRes = b_matRes.get_access<sycl::access::mode::write>(h);
|
auto acc_matRes = b_matRes.template get_access<sycl::access::mode::write>(h);
|
||||||
|
|
||||||
h.parallel_for(global_range, [=](sycl::id<2> ID) {
|
h.parallel_for(global_range, [=](sycl::id<2> ID) {
|
||||||
auto i = ID[0];
|
auto i = ID[0];
|
||||||
auto j = ID[1];
|
auto j = ID[1];
|
||||||
data_type sum = 0;
|
T sum = 0;
|
||||||
|
|
||||||
for (auto k = 0; k < matA.cols; k++) {
|
for (auto k = 0; k < matA.cols; k++) {
|
||||||
sum += acc_matA[i][k] * acc_matB[k][j];
|
sum += acc_matA[i][k] * acc_matB[k][j];
|
||||||
@ -92,32 +94,33 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<data_type> &matA,
|
|||||||
return matRes.chksum();
|
return matRes.chksum();
|
||||||
}
|
}
|
||||||
|
|
||||||
auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<data_type> &matA,
|
template <class T>
|
||||||
const Matrix<data_type> &matB) {
|
auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
|
||||||
|
const Matrix<T> &matB) {
|
||||||
|
|
||||||
Matrix<data_type> matB_t = matB.t();
|
Matrix<T> matB_t = matB.t();
|
||||||
Matrix<data_type> matRes(matA.rows, matB.cols);
|
Matrix<T> matRes(matA.rows, matB.cols);
|
||||||
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
sycl::range<2> global_range(matRes.rows, matRes.cols);
|
||||||
|
|
||||||
{
|
{
|
||||||
sycl::buffer<data_type, 2> b_matA(matA.mem.data(),
|
sycl::buffer<T, 2> b_matA(matA.mem.data(),
|
||||||
sycl::range<2>(matA.rows, matA.cols));
|
sycl::range<2>(matA.rows, matA.cols));
|
||||||
|
|
||||||
sycl::buffer<data_type, 2> b_matB(matB_t.mem.data(),
|
sycl::buffer<T, 2> b_matB(matB_t.mem.data(),
|
||||||
sycl::range<2>(matB_t.rows, matB_t.cols));
|
sycl::range<2>(matB_t.rows, matB_t.cols));
|
||||||
|
|
||||||
sycl::buffer<data_type, 2> b_matRes(
|
sycl::buffer<T, 2> b_matRes(
|
||||||
matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
|
matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
|
||||||
|
|
||||||
q.submit([&](sycl::handler &h) {
|
q.submit([&](sycl::handler &h) {
|
||||||
auto acc_matA = b_matA.get_access<sycl::access::mode::read>(h);
|
auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
|
||||||
auto acc_matB = b_matB.get_access<sycl::access::mode::read>(h);
|
auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
|
||||||
auto acc_matRes = b_matRes.get_access<sycl::access::mode::write>(h);
|
auto acc_matRes = b_matRes.template get_access<sycl::access::mode::write>(h);
|
||||||
|
|
||||||
h.parallel_for(global_range, [=](sycl::id<2> ID) {
|
h.parallel_for(global_range, [=](sycl::id<2> ID) {
|
||||||
auto i = ID[0];
|
auto i = ID[0];
|
||||||
auto j = ID[1];
|
auto j = ID[1];
|
||||||
data_type sum = 0;
|
T sum = 0;
|
||||||
|
|
||||||
for (auto k = 0; k < matA.cols; k++) {
|
for (auto k = 0; k < matA.cols; k++) {
|
||||||
sum += acc_matA[i][k] * acc_matB[j][k];
|
sum += acc_matA[i][k] * acc_matB[j][k];
|
||||||
@ -246,32 +249,32 @@ auto main(int argc, char **argv) -> int {
|
|||||||
assert(matA.rows == matB.cols);
|
assert(matA.rows == matB.cols);
|
||||||
|
|
||||||
#ifdef SEQ_BENCH
|
#ifdef SEQ_BENCH
|
||||||
auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB);
|
auto cpu_chksum = measure<>::duration(matrixMultCPU<data_type>, matA, matB);
|
||||||
print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
|
print_pair("CPU - naive", cpu_chksum.first, cpu_chksum.second.count());
|
||||||
|
|
||||||
auto cpu_transp_chksum =
|
auto cpu_transp_chksum =
|
||||||
measure<>::duration(matrixMultTransposeCPU, matA, matB);
|
measure<>::duration(matrixMultTransposeCPU<data_type>, matA, matB);
|
||||||
print_pair("CPU - transposed", cpu_transp_chksum.first,
|
print_pair("CPU - transposed", cpu_transp_chksum.first,
|
||||||
cpu_transp_chksum.second.count());
|
cpu_transp_chksum.second.count());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
sycl::queue cpu_queue(sycl::cpu_selector_v);
|
sycl::queue cpu_queue(sycl::cpu_selector_v);
|
||||||
|
|
||||||
auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB);
|
auto omp_chksum = measure<>::duration(matrixMultSYCL<data_type>, cpu_queue, matA, matB);
|
||||||
print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count());
|
print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count());
|
||||||
|
|
||||||
auto omp_transp_chksum =
|
auto omp_transp_chksum =
|
||||||
measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB);
|
measure<>::duration(matrixMultTransposeSYCL<data_type>, cpu_queue, matA, matB);
|
||||||
print_pair("OMP - transposed", omp_transp_chksum.first,
|
print_pair("OMP - transposed", omp_transp_chksum.first,
|
||||||
omp_transp_chksum.second.count());
|
omp_transp_chksum.second.count());
|
||||||
|
|
||||||
sycl::queue gpu_queue(sycl::gpu_selector_v);
|
sycl::queue gpu_queue(sycl::gpu_selector_v);
|
||||||
|
|
||||||
auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB);
|
auto gpu_chksum = measure<>::duration(matrixMultSYCL<data_type>, gpu_queue, matA, matB);
|
||||||
print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count());
|
print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count());
|
||||||
|
|
||||||
auto gpu_transp_chksum =
|
auto gpu_transp_chksum =
|
||||||
measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB);
|
measure<>::duration(matrixMultTransposeSYCL<data_type>, gpu_queue, matA, matB);
|
||||||
print_pair("GPU - transposed", gpu_transp_chksum.first,
|
print_pair("GPU - transposed", gpu_transp_chksum.first,
|
||||||
gpu_transp_chksum.second.count());
|
gpu_transp_chksum.second.count());
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user