From 4f92c2a976375a1b9c41833a046319d697b3d036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Max=20L=C3=BCbke?= Date: Mon, 2 Oct 2023 12:59:02 +0200 Subject: [PATCH] implement CPU only functions --- sycl_comp.cpp | 110 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 87 insertions(+), 23 deletions(-) diff --git a/sycl_comp.cpp b/sycl_comp.cpp index d6889cd..8def274 100644 --- a/sycl_comp.cpp +++ b/sycl_comp.cpp @@ -1,38 +1,102 @@ +#include +#include +#include #include #include +#include -using namespace cl::sycl; +#include "matrix.hpp" +#include "timer.hpp" + +namespace sycl = cl::sycl; + +#define stream_hex(_val) std::hex << _val << std::dec + +template +auto matrixMultCPU(const Matrix &matA, const Matrix &matB) { + Matrix res(matA.rows, matB.cols); + for (std::uint32_t i = 0; i < res.rows; i++) { + for (std::uint32_t j = 0; j < res.cols; j++) { + auto &res_val = res(j, i) = 0; + for (std::uint32_t k = 0; k < matA.cols; k++) { + res_val += matA(i, k) * matB(k, j); + } + } + } + + return res.chksum(); +} + +template +auto matrixMultTransposeCPU(const Matrix &matA, const Matrix &matB) { + Matrix matB_t = matB.t(); + Matrix res(matA.rows, matB.cols); + for (std::uint32_t i = 0; i < res.rows; i++) { + for (std::uint32_t j = 0; j < res.cols; j++) { + auto &res_val = res(j, i) = 0; + for (std::uint32_t k = 0; k < matA.cols; k++) { + res_val += matA(i, k) * matB_t(j, k); + } + } + } + + return res.chksum(); +} auto main(int argc, char **argv) -> int { - queue q; - - std::cout << "Using device: " << q.get_device().get_info() - << "\n"; - - int hostArray[42]; - auto deviceArray = static_cast(malloc_device(42 * sizeof(int), q)); - - for (int i = 0; i < 42; i++) { - hostArray[i] = i; + if (argc != 3) { + std::cerr << "Provide 2 arguments to the program!\n" + << "Usage: .txt .txt\n"; + return EXIT_FAILURE; } - q.memcpy(deviceArray, hostArray, 42 * sizeof(int)); - q.wait(); + Matrix matA(argv[1]); + Matrix matB(argv[2]); - q.submit([&](handler &h) { - h.parallel_for(range<1>(42), [=](auto ID) { deviceArray[ID]++; }); - }); + assert(matA.rows == matB.cols); - q.wait(); + auto cpu_chksum = measure<>::duration(matrixMultCPU, matA, matB); + std::cout << "CPU only \n\t->" + << "Check: 0x" << stream_hex(cpu_chksum.first) + << "\tRuntime: " << cpu_chksum.second.count() << " us\n\n"; - q.memcpy(hostArray, deviceArray, 42 * sizeof(int)); - q.wait(); + auto cpu_transp_chksum = + measure<>::duration(matrixMultTransposeCPU, matA, matB); + std::cout << "CPU only - transposed \n\t->" + << "Check: 0x" << stream_hex(cpu_transp_chksum.first) + << "\tRuntime: " << cpu_transp_chksum.second.count() << " us\n\n"; - for (int i = 0; i < 42; i++) { - std::cout << hostArray[i] << " "; - } + return EXIT_SUCCESS; - std::cout << "\n"; + // sycl::queue q; + + // std::cout << "Using device: " + // << q.get_device().get_info() << "\n"; + + // int hostArray[42]; + // auto deviceArray = static_cast(malloc_device(42 * sizeof(int), q)); + + // for (int i = 0; i < 42; i++) { + // hostArray[i] = i; + // } + + // q.memcpy(deviceArray, hostArray, 42 * sizeof(int)); + // q.wait(); + + // q.submit([&](sycl::handler &h) { + // h.parallel_for(sycl::range<1>(42), [=](auto ID) { deviceArray[ID]++; }); + // }); + + // q.wait(); + + // q.memcpy(hostArray, deviceArray, 42 * sizeof(int)); + // q.wait(); + + // for (int i = 0; i < 42; i++) { + // std::cout << hostArray[i] << " "; + // } + + // std::cout << "\n"; }