From 77c6b84a6f3ff6a5c5d6fe06bd0d4c205e87f133 Mon Sep 17 00:00:00 2001
From: Marco De Lucia
Date: Fri, 13 Oct 2023 10:09:35 +0200
Subject: [PATCH] Commit CUDA code in src/cuda

---
 src/cuda/matrix.hpp    |  87 ++++++++++++++++++++
 src/cuda/matrixMult.cu | 193 +++++++++++++++++++++++++++++++++++++++++
 src/cuda/timer.hpp     |  22 +++++
 3 files changed, 302 insertions(+)
 create mode 100644 src/cuda/matrix.hpp
 create mode 100644 src/cuda/matrixMult.cu
 create mode 100644 src/cuda/timer.hpp

diff --git a/src/cuda/matrix.hpp b/src/cuda/matrix.hpp
new file mode 100644
index 0000000..91f3aa7
--- /dev/null
+++ b/src/cuda/matrix.hpp
@@ -0,0 +1,87 @@
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include <xxhash.h>
+
+template <typename T> struct Matrix {
+  std::uint32_t rows;
+  std::uint32_t cols;
+  std::vector<T> mem;
+
+  Matrix(std::uint32_t _rows, std::uint32_t _cols)
+      : rows(_rows), cols(_cols) {
+    mem.resize(rows * cols);
+  }
+
+  Matrix(const char *filepath) {
+    std::ifstream matfs(filepath);
+
+    if (matfs.fail() || !matfs.is_open()) {
+      throw std::runtime_error("Error opening matrix file");
+    }
+
+    matfs >> this->rows >> this->cols;
+    this->mem.resize(this->rows * this->cols);
+
+    for (std::uint32_t i = 0; i < rows; i++) {
+      for (std::uint32_t j = 0; j < cols; j++) {
+        matfs >> (*this)(i, j);
+      }
+    }
+
+    matfs.close();
+  }
+
+  T &operator()(std::uint32_t row_i, std::uint32_t col_j) {
+    return mem[row_i * cols + col_j];
+  }
+
+  const T &operator()(std::uint32_t row_i, std::uint32_t col_j) const {
+    return mem[row_i * cols + col_j];
+  }
+
+  Matrix &operator=(const Matrix &mat) {
+    this->rows = mat.rows;
+    this->cols = mat.cols;
+    this->mem = mat.mem;
+
+    return *this;
+  }
+
+  Matrix t() const {
+    // The transpose swaps the dimensions: allocate a cols x rows matrix.
+    Matrix transposed(this->cols, this->rows);
+    for (std::uint32_t i = 0; i < this->rows; i++) {
+      for (std::uint32_t j = 0; j < this->cols; j++) {
+        transposed(j, i) = (*this)(i, j);
+      }
+    }
+    return transposed;
+  }
+
+  XXH32_hash_t chksum() const {
+    constexpr XXH32_hash_t HASH_SEED = 42;
+    // XXH32 expects the length in bytes, not in elements.
+    return XXH32(this->mem.data(), this->bytes(), HASH_SEED);
+  }
+
+  std::size_t bytes() const { return this->mem.size() * sizeof(T); }
+};
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, Matrix<T> &mat) {
+  for (std::uint32_t i = 0; i < mat.rows; i++) {
+    for (std::uint32_t j = 0; j < mat.cols; j++) {
+      os << mat(i, j) << "\t";
+    }
+    os << "\n";
+  }
+
+  return os;
+}
+
+#endif // MATRIX_H_
diff --git a/src/cuda/matrixMult.cu b/src/cuda/matrixMult.cu
new file mode 100644
index 0000000..1a1853d
--- /dev/null
+++ b/src/cuda/matrixMult.cu
@@ -0,0 +1,193 @@
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "matrix.hpp"
+#include "timer.hpp"
+
+#define stream_hex(_val) std::hex << _val << std::dec
+
+#define print_pair(_desc, _chksm, _time)                                      \
+  std::cout << _desc << "\n\t"                                                \
+            << "\t-> Check: 0x" << stream_hex(_chksm)                         \
+            << "\tRuntime: " << _time << " us\n\n"
+
+#define checkCudaErrors(call)                                                 \
+  do {                                                                        \
+    cudaError_t err = call;                                                   \
+    if (err != cudaSuccess) {                                                 \
+      printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,                 \
+             cudaGetErrorString(err));                                        \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  } while (0)
+
+enum CUDA_FUNC { CUDA_NAIVE, CUDA_TILED };
+
+using data_type = int;
+
+template <typename T>
+__global__ void matrixMultCuda(T *matA, T *matB, std::uint32_t colsA,
+                               std::uint32_t colsB, T *matRes) {
+
+  const auto tidx = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto tidy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  T sum = 0;
+
+  for (std::uint32_t k = 0; k < colsA; k++) {
+    sum += matA[tidy * colsA + k] * matB[k * colsB + tidx];
+  }
+
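+  // The full dot product of row tidy of A and column tidx of B has been
+  // accumulated in a register; a single global store writes the result.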
+  matRes[tidy * colsB + tidx] = sum;
+}
+
+template <typename T>
+__global__ void matrixMultCudaTiled(T *matA, T *matB, std::uint32_t colsA,
+                                    std::uint32_t colsB, T *matRes) {
+
+  const auto tidx = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto tidy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  const auto tile_size = blockDim.x;
+
+  std::uint32_t a = tidy * colsA + threadIdx.x;
+  std::uint32_t b = tile_size * blockIdx.x + threadIdx.x + threadIdx.y * colsB;
+
+  const auto a_step = tile_size;
+  const auto b_step = tile_size * colsB;
+
+  const auto a_end = a + colsA;
+
+  extern __shared__ T localMem[];
+
+  T *localA = &localMem[0];
+  T *localB = &localMem[tile_size * tile_size];
+
+  T sum = 0;
+
+  for (; a < a_end; a += a_step, b += b_step) {
+    localA[threadIdx.y * tile_size + threadIdx.x] = matA[a];
+    localB[threadIdx.y * tile_size + threadIdx.x] = matB[b];
+
+    __syncthreads();
+
+#pragma unroll
+    for (std::uint32_t k = 0; k < tile_size; k++) {
+      sum += localA[threadIdx.y * tile_size + k] *
+             localB[k * tile_size + threadIdx.x];
+    }
+
+    __syncthreads();
+  }
+
+  matRes[tidy * colsB + tidx] = sum;
+}
+
+inline int prevPowerOfTwo(int n) {
+  if (n <= 0) {
+    return 0;
+  }
+
+  // Find the position of the most significant set bit (MSB)
+  int msbPos = 0;
+  while (n > 1) {
+    n >>= 1;
+    msbPos++;
+  }
+
+  // 1 << msbPos is the largest power of two not exceeding the input
+  int result = 1 << msbPos;
+
+  return result;
+}
+
+template <CUDA_FUNC F, typename T>
+std::uint32_t matrixMult(const Matrix<T> &matA, const Matrix<T> &matB) {
+  Matrix<T> matRes(matA.rows, matB.cols);
+
+  T *d_matA;
+  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_matA), matA.bytes()));
+  checkCudaErrors(cudaMemcpy(d_matA, matA.mem.data(), matA.bytes(),
+                             cudaMemcpyHostToDevice));
+
+  T *d_matB;
+  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_matB), matB.bytes()));
+  checkCudaErrors(cudaMemcpy(d_matB, matB.mem.data(), matB.bytes(),
+                             cudaMemcpyHostToDevice));
+
+  T *d_matRes;
+  checkCudaErrors(
+      cudaMalloc(reinterpret_cast<void **>(&d_matRes), matRes.bytes()));
+
+  // obtain the maximum number of threads per block for the given device
+  int device_id;
+  checkCudaErrors(cudaGetDevice(&device_id));
+
+  cudaDeviceProp dev_props;
+  checkCudaErrors(cudaGetDeviceProperties(&dev_props, device_id));
+
+  std::size_t max_block_size = dev_props.maxThreadsPerBlock;
+
+  // each tile must not exceed max_block_size -> T x T <= max_block_size
+  const std::uint32_t max_tile_size =
+      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_block_size)));
+
+  // the problem size of the multiplication may be smaller than the maximum
+  // tile size
+  const std::uint32_t block_size = std::min(max_tile_size, matA.cols);
+
+  // assumes the matrix dimensions are multiples of the block size
+  dim3 threads(block_size, block_size);
+  dim3 grid(matB.cols / threads.x, matA.rows / threads.y);
+
+  cudaDeviceSynchronize();
+
+  if constexpr (F == CUDA_NAIVE) {
+    matrixMultCuda<<<grid, threads>>>(d_matA, d_matB, matA.cols, matB.cols,
+                                      d_matRes);
+  } else if constexpr (F == CUDA_TILED) {
+    // two tiles (one for A, one for B) live in dynamic shared memory
+    const auto shared_mem_size = sizeof(T) * block_size * block_size * 2;
+    matrixMultCudaTiled<<<grid, threads, shared_mem_size>>>(
+        d_matA, d_matB, matA.cols, matB.cols, d_matRes);
+  }
+
+  cudaDeviceSynchronize();
+
+  checkCudaErrors(cudaMemcpy(matRes.mem.data(), d_matRes, matRes.bytes(),
+                             cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize();
+
+  checkCudaErrors(cudaFree(d_matA));
+  checkCudaErrors(cudaFree(d_matB));
+  checkCudaErrors(cudaFree(d_matRes));
+
+  return matRes.chksum();
+}
+
+auto main(int argc, char *argv[]) -> int {
+
+  if (argc != 3) {
+    std::cerr << "Provide 2 arguments to the program!\n"
+              << "Usage: matrixMult <matA>.txt <matB>.txt\n";
+    return EXIT_FAILURE;
+  }
+
+  Matrix<data_type> matA(argv[1]);
+  Matrix<data_type> matB(argv[2]);
+
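+  // Inner dimensions must agree: an (m x k) A and a (k x n) B yield an
+  // (m x n) result, e.g. (2 x 3) * (3 x 4) -> (2 x 4).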
+  assert(matA.cols == matB.rows);
+
+  const auto naive_chk =
+      measure<>::duration(matrixMult<CUDA_NAIVE, data_type>, matA, matB);
+
+  print_pair("CUDA naive", naive_chk.first, naive_chk.second.count());
+
+  const auto tiled_chk =
+      measure<>::duration(matrixMult<CUDA_TILED, data_type>, matA, matB);
+
+  print_pair("CUDA tiled", tiled_chk.first, tiled_chk.second.count());
+
+  return 0;
+}
diff --git a/src/cuda/timer.hpp b/src/cuda/timer.hpp
new file mode 100644
index 0000000..5e7d1d7
--- /dev/null
+++ b/src/cuda/timer.hpp
@@ -0,0 +1,22 @@
+#ifndef TIMER_H_
+#define TIMER_H_
+
+#include <chrono>
+#include <functional>
+#include <utility>
+
+template <typename TimeT = std::chrono::microseconds,
+          typename ClockType = std::chrono::steady_clock>
+struct measure {
+  template <typename F, typename... Args>
+  static auto duration(F &&func, Args &&...args) {
+    auto start = ClockType::now();
+    auto retVal =
+        std::invoke(std::forward<F>(func), std::forward<Args>(args)...);
+    auto end = ClockType::now();
+    return std::make_pair(retVal,
+                          std::chrono::duration_cast<TimeT>(end - start));
+  }
+};
+
+#endif // TIMER_H_
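
Build/run note: a minimal sketch, assuming nvcc and the xxHash development
headers are installed and that mat_a.txt / mat_b.txt are hypothetical input
files:

    nvcc -O2 -o matrixMult src/cuda/matrixMult.cu -lxxhash
    ./matrixMult mat_a.txt mat_b.txt

The Matrix(const char *) constructor reads a whitespace-separated text file
holding the two dimensions followed by the elements in row-major order,
e.g. for a 2 x 2 matrix:

    2 2
    1 2
    3 4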