implement CPU only functions

2023-10-02 12:59:02 +02:00 · 2023-10-02 12:59:02 +02:00 · 4f92c2a976
commit 4f92c2a976
parent 381fa26aac
1 changed files with 87 additions and 23 deletions
--- a/sycl_comp.cpp
+++ b/sycl_comp.cpp
@ -1,38 +1,102 @@
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
 #include <iostream>

 #include <CL/sycl.hpp>
+#include <utility>

-using namespace cl::sycl;
+#include "matrix.hpp"
+#include "timer.hpp"
+
+namespace sycl = cl::sycl;
+
+// Streams `val` in hexadecimal and then switches the stream back to decimal.
+// The argument is parenthesized so that an expression using an operator that
+// binds looser than << (for example `a | b`) is still inserted as one operand.
+#define stream_hex(val) std::hex << (val) << std::dec
+
+/// Multiplies matA by matB on the CPU with a plain triple loop and returns the
+/// checksum of the product.
+///
+/// @param matA left operand; its column count bounds the inner loop.
+/// @param matB right operand. NOTE(review): assumed to have matA.cols rows;
+///             that is not checked here, so callers must validate it.
+/// @return the value of Matrix<T>::chksum() for the product matrix.
+template <class T>
+auto matrixMultCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
+  Matrix<T> res(matA.rows, matB.cols);
+  for (std::uint32_t i = 0; i < res.rows; i++) {
+    for (std::uint32_t j = 0; j < res.cols; j++) {
+      // Address the result in the same index order as the operands below
+      // (i runs over rows, j over columns). The swapped (j, i) form stored
+      // the product transposed and is out of range for non-square results.
+      auto &res_val = res(i, j) = 0;
+      for (std::uint32_t k = 0; k < matA.cols; k++) {
+        res_val += matA(i, k) * matB(k, j);
+      }
+    }
+  }
+
+  return res.chksum();
+}
+
+/// Multiplies matA by matB on the CPU, reading the right operand through its
+/// transpose (matB.t()) so the inner loop walks both operands with the same
+/// trailing index k. Returns the checksum of the product.
+///
+/// @param matA left operand; its column count bounds the inner loop.
+/// @param matB right operand. NOTE(review): assumed to have matA.cols rows;
+///             that is not checked here, so callers must validate it.
+/// @return the value of Matrix<T>::chksum() for the product matrix.
+template <class T>
+auto matrixMultTransposeCPU(const Matrix<T> &matA, const Matrix<T> &matB) {
+  Matrix<T> matB_t = matB.t();
+  Matrix<T> res(matA.rows, matB.cols);
+  for (std::uint32_t i = 0; i < res.rows; i++) {
+    for (std::uint32_t j = 0; j < res.cols; j++) {
+      // Address the result in the same index order as matA (i runs over
+      // rows, j over columns). The swapped (j, i) form stored the product
+      // transposed and is out of range for non-square results.
+      auto &res_val = res(i, j) = 0;
+      for (std::uint32_t k = 0; k < matA.cols; k++) {
+        res_val += matA(i, k) * matB_t(j, k);
+      }
+    }
+  }
+
+  return res.chksum();
+}

 auto main(int argc, char **argv) -> int {

-  queue q;
-
-  std::cout << "Using device: " << q.get_device().get_info<info::device::name>()
-            << "\n";
-
-  int hostArray[42];
-  auto deviceArray = static_cast<int *>(malloc_device(42 * sizeof(int), q));
-
-  for (int i = 0; i < 42; i++) {
-    hostArray[i] = i;
+  // Exactly two arguments are expected after the program name: the two
+  // matrix files.
+  if (argc != 3) {
+    std::cerr << "Provide 2 arguments to the program!\n"
+              << "Usage: <prog> <matA>.txt <matB>.txt\n";
+    return EXIT_FAILURE;
  }

-  q.memcpy(deviceArray, hostArray, 42 * sizeof(int));
-  q.wait();
+  Matrix<int> matA(argv[1]);
+  Matrix<int> matB(argv[2]);

-  q.submit([&](handler &h) {
-    h.parallel_for(range<1>(42), [=](auto ID) { deviceArray[ID]++; });
-  });
+  // A * B needs the column count of A to equal the row count of B; the
+  // multiplication loops index matB with k < matA.cols. The matrices come
+  // from user-supplied files, so this is checked at runtime instead of with
+  // assert(), which is compiled out when NDEBUG is defined.
+  if (matA.cols != matB.rows) {
+    std::cerr << "Dimension mismatch: columns of <matA> must equal rows of "
+                 "<matB>\n";
+    return EXIT_FAILURE;
+  }

-  q.wait();
+  // measure<>::duration is a project helper; its result is used as a pair of
+  // (checksum, duration). NOTE(review): the " us" label assumes the helper
+  // reports microseconds - confirm in timer.hpp.
+  auto cpu_chksum = measure<>::duration(matrixMultCPU<int>, matA, matB);
+  std::cout << "CPU only \n\t->"
+            << "Check: 0x" << stream_hex(cpu_chksum.first)
+            << "\tRuntime: " << cpu_chksum.second.count() << " us\n\n";

-  q.memcpy(hostArray, deviceArray, 42 * sizeof(int));
-  q.wait();
+  auto cpu_transp_chksum =
+      measure<>::duration(matrixMultTransposeCPU<int>, matA, matB);
+  std::cout << "CPU only - transposed \n\t->"
+            << "Check: 0x" << stream_hex(cpu_transp_chksum.first)
+            << "\tRuntime: " << cpu_transp_chksum.second.count() << " us\n\n";

-  for (int i = 0; i < 42; i++) {
-    std::cout << hostArray[i] << " ";
-  }
+  // Only the CPU paths run; the SYCL sample below stays commented out.
+  return EXIT_SUCCESS;

-  std::cout << "\n";
+  // sycl::queue q;
+
+  // std::cout << "Using device: "
+  //           << q.get_device().get_info<sycl::info::device::name>() << "\n";
+
+  // int hostArray[42];
+  // auto deviceArray = static_cast<int *>(malloc_device(42 * sizeof(int), q));
+
+  // for (int i = 0; i < 42; i++) {
+  //   hostArray[i] = i;
+  // }
+
+  // q.memcpy(deviceArray, hostArray, 42 * sizeof(int));
+  // q.wait();
+
+  // q.submit([&](sycl::handler &h) {
+  //   h.parallel_for(sycl::range<1>(42), [=](auto ID) { deviceArray[ID]++; });
+  // });
+
+  // q.wait();
+
+  // q.memcpy(hostArray, deviceArray, 42 * sizeof(int));
+  // q.wait();
+
+  // for (int i = 0; i < 42; i++) {
+  //   std::cout << hostArray[i] << " ";
+  // }
+
+  // std::cout << "\n";
 }