#include #include #include #include #include #include #include "matrix.hpp" #include "timer.hpp" namespace sycl = cl::sycl; #define stream_hex(_val) std::hex << _val << std::dec template auto matrixMultCPU(const Matrix &matA, const Matrix &matB) { Matrix res(matA.rows, matB.cols); for (std::uint32_t i = 0; i < res.rows; i++) { for (std::uint32_t j = 0; j < res.cols; j++) { auto &res_val = res(j, i) = 0; for (std::uint32_t k = 0; k < matA.cols; k++) { res_val += matA(i, k) * matB(k, j); } } } return res.chksum(); } template auto matrixMultTransposeCPU(const Matrix &matA, const Matrix &matB) { Matrix matB_t = matB.t(); Matrix res(matA.rows, matB.cols); for (std::uint32_t i = 0; i < res.rows; i++) { for (std::uint32_t j = 0; j < res.cols; j++) { auto &res_val = res(j, i) = 0; for (std::uint32_t k = 0; k < matA.cols; k++) { res_val += matA(i, k) * matB_t(j, k); } } } return res.chksum(); } auto main(int argc, char **argv) -> int { if (argc != 3) { std::cerr << "Provide 2 arguments to the program!\n" << "Usage: .txt .txt\n"; return EXIT_FAILURE; } Matrix matA(argv[1]); Matrix matB(argv[2]); assert(matA.rows == matB.cols); auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB); std::cout << "CPU only \n\t->" << "Check: 0x" << stream_hex(cpu_chksum.first) << "\tRuntime: " << cpu_chksum.second.count() << " us\n\n"; auto cpu_transp_chksum = measure<>::duration(matrixMultTransposeCPU, matA, matB); std::cout << "CPU only - transposed \n\t->" << "Check: 0x" << stream_hex(cpu_transp_chksum.first) << "\tRuntime: " << cpu_transp_chksum.second.count() << " us\n\n"; return EXIT_SUCCESS; // sycl::queue q; // std::cout << "Using device: " // << q.get_device().get_info() << "\n"; // int hostArray[42]; // auto deviceArray = static_cast(malloc_device(42 * sizeof(int), q)); // for (int i = 0; i < 42; i++) { // hostArray[i] = i; // } // q.memcpy(deviceArray, hostArray, 42 * sizeof(int)); // q.wait(); // q.submit([&](sycl::handler &h) { // h.parallel_for(sycl::range<1>(42), [=](auto ID) { deviceArray[ID]++; }); // }); // q.wait(); // q.memcpy(hostArray, deviceArray, 42 * sizeof(int)); // q.wait(); // for (int i = 0; i < 42; i++) { // std::cout << hostArray[i] << " "; // } // std::cout << "\n"; }