From 4f92c2a976375a1b9c41833a046319d697b3d036 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Max=20L=C3=BCbke?= <mluebke@uni-potsdam.de>
Date: Mon, 2 Oct 2023 12:59:02 +0200
Subject: [PATCH] implement CPU only functions

---
 sycl_comp.cpp | 110 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 87 insertions(+), 23 deletions(-)
diff --git a/sycl_comp.cpp b/sycl_comp.cpp
index d6889cd..8def274 100644
--- a/sycl_comp.cpp
+++ b/sycl_comp.cpp
@@ -1,38 +1,102 @@
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
 #include <iostream>
 
 #include <CL/sycl.hpp>
+#include <utility>
 
-using namespace cl::sycl;
+#include "matrix.hpp"
+#include "timer.hpp"
+
+namespace sycl = cl::sycl;
+
+#define stream_hex(_val) std::hex << (_val) << std::dec // stream _val in hex, then switch the stream back to decimal; argument parenthesized so operator expressions expand safely
+
+template <class T> // T: element type of both operands and of the result
+auto matrixMultCPU(const Matrix<T> &matA, const Matrix<T> &matB) { // triple-loop product matA * matB on the host; returns res.chksum()
+  Matrix<T> res(matA.rows, matB.cols);
+  for (std::uint32_t i = 0; i < res.rows; i++) {
+    for (std::uint32_t j = 0; j < res.cols; j++) {
+      auto &res_val = res(i, j) = 0; // row i, column j -- same index order as matA(i, k) / matB(k, j) below
+      for (std::uint32_t k = 0; k < matA.cols; k++) {
+        res_val += matA(i, k) * matB(k, j);
+      }
+    }
+  }
+
+  return res.chksum();
+}
+
+template <class T> // T: element type of both operands and of the result
+auto matrixMultTransposeCPU(const Matrix<T> &matA, const Matrix<T> &matB) { // product matA * matB computed against matB.t(); returns res.chksum()
+  Matrix<T> matB_t = matB.t(); // presumably the transpose of matB -- confirm in matrix.hpp
+  Matrix<T> res(matA.rows, matB.cols);
+  for (std::uint32_t i = 0; i < res.rows; i++) {
+    for (std::uint32_t j = 0; j < res.cols; j++) {
+      auto &res_val = res(i, j) = 0; // row i, column j -- same index order as matA(i, k) below
+      for (std::uint32_t k = 0; k < matA.cols; k++) {
+        res_val += matA(i, k) * matB_t(j, k); // matB_t(j, k) stands in for matB(k, j)
+      }
+    }
+  }
+
+  return res.chksum();
+}
 
 auto main(int argc, char **argv) -> int {
 
-  queue q;
-
-  std::cout << "Using device: " << q.get_device().get_info<info::device::name>()
-            << "\n";
-
-  int hostArray[42];
-  auto deviceArray = static_cast<int *>(malloc_device(42 * sizeof(int), q));
-
-  for (int i = 0; i < 42; i++) {
-    hostArray[i] = i;
+  if (argc != 3) { // exactly two arguments are required: the inputs for matA and matB
+    std::cerr << "Provide 2 arguments to the program!\n"
+              << "Usage: <prog> <matA>.txt <matB>.txt\n";
+    return EXIT_FAILURE;
   }
 
-  q.memcpy(deviceArray, hostArray, 42 * sizeof(int));
-  q.wait();
+  Matrix<int> matA(argv[1]); // built from the first argument
+  Matrix<int> matB(argv[2]); // built from the second argument
 
-  q.submit([&](handler &h) {
-    h.parallel_for(range<1>(42), [=](auto ID) { deviceArray[ID]++; });
-  });
+  assert(matA.cols == matB.rows); // inner dimensions must agree: k indexes matA's columns and matB's rows (no-op if NDEBUG is defined)
 
-  q.wait();
+  auto cpu_chksum = measure<>::duration(matrixMultCPU<int>, matA, matB); // .first: checksum, .second: elapsed time
+  std::cout << "CPU only \n\t->"
+            << "Check: 0x" << stream_hex(cpu_chksum.first)
+            << "\tRuntime: " << cpu_chksum.second.count() << " us\n\n";
 
-  q.memcpy(hostArray, deviceArray, 42 * sizeof(int));
-  q.wait();
+  auto cpu_transp_chksum =
+      measure<>::duration(matrixMultTransposeCPU<int>, matA, matB);
+  std::cout << "CPU only - transposed \n\t->"
+            << "Check: 0x" << stream_hex(cpu_transp_chksum.first)
+            << "\tRuntime: " << cpu_transp_chksum.second.count() << " us\n\n";
 
-  for (int i = 0; i < 42; i++) {
-    std::cout << hostArray[i] << " ";
-  }
+  return EXIT_SUCCESS; // everything below is the earlier SYCL sample, kept commented out
 
-  std::cout << "\n";
+  // sycl::queue q;
+
+  // std::cout << "Using device: "
+  //           << q.get_device().get_info<sycl::info::device::name>() << "\n";
+
+  // int hostArray[42];
+  // auto deviceArray = static_cast<int *>(malloc_device(42 * sizeof(int), q));
+
+  // for (int i = 0; i < 42; i++) {
+  //   hostArray[i] = i;
+  // }
+
+  // q.memcpy(deviceArray, hostArray, 42 * sizeof(int));
+  // q.wait();
+
+  // q.submit([&](sycl::handler &h) {
+  //   h.parallel_for(sycl::range<1>(42), [=](auto ID) { deviceArray[ID]++; });
+  // });
+
+  // q.wait();
+
+  // q.memcpy(hostArray, deviceArray, 42 * sizeof(int));
+  // q.wait();
+
+  // for (int i = 0; i < 42; i++) {
+  //   std::cout << hostArray[i] << " ";
+  // }
+
+  // std::cout << "\n";
 }