From 77c6b84a6f3ff6a5c5d6fe06bd0d4c205e87f133 Mon Sep 17 00:00:00 2001
From: Marco De Lucia
Date: Fri, 13 Oct 2023 10:09:35 +0200
Subject: [PATCH] Commit CUDA code in src/cuda

---
 src/cuda/matrix.hpp    |  87 ++++++++++++++++++++
 src/cuda/matrixMult.cu | 193 +++++++++++++++++++++++++++++++++++++++++
 src/cuda/timer.hpp     |  22 +++++
 3 files changed, 302 insertions(+)
 create mode 100644 src/cuda/matrix.hpp
 create mode 100644 src/cuda/matrixMult.cu
 create mode 100644 src/cuda/timer.hpp

diff --git a/src/cuda/matrix.hpp b/src/cuda/matrix.hpp
new file mode 100644
index 0000000..91f3aa7
--- /dev/null
+++ b/src/cuda/matrix.hpp
@@ -0,0 +1,87 @@
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include <xxhash.h>
+
+template <typename T> struct Matrix {
+  std::uint32_t rows;
+  std::uint32_t cols;
+  std::vector<T> mem;
+
+  Matrix(std::uint32_t _rows, std::uint32_t _cols)
+      : rows(_rows), cols(_cols) {
+    mem.resize(rows * cols);
+  }
+
+  Matrix(const char *filepath) {
+    std::ifstream matfs(filepath);
+
+    if (matfs.fail() || !matfs.is_open()) {
+      throw std::runtime_error("Error opening matrix file");
+    }
+
+    matfs >> this->rows >> this->cols;
+    this->mem.resize(this->rows * this->cols);
+
+    for (std::uint32_t i = 0; i < rows; i++) {
+      for (std::uint32_t j = 0; j < cols; j++) {
+        matfs >> (*this)(i, j);
+      }
+    }
+
+    matfs.close();
+  }
+
+  T &operator()(std::uint32_t row_i, std::uint32_t col_j) {
+    return mem[row_i * cols + col_j];
+  }
+
+  const T &operator()(std::uint32_t row_i, std::uint32_t col_j) const {
+    return mem[row_i * cols + col_j];
+  }
+
+  Matrix &operator=(const Matrix &mat) {
+    this->rows = mat.rows;
+    this->cols = mat.cols;
+    this->mem = mat.mem;
+
+    return *this;
+  }
+
+  Matrix t() const {
+    // The transpose swaps the dimensions: allocate a cols x rows matrix.
+    Matrix transposed(this->cols, this->rows);
+    for (std::uint32_t i = 0; i < this->rows; i++) {
+      for (std::uint32_t j = 0; j < this->cols; j++) {
+        transposed(j, i) = (*this)(i, j);
+      }
+    }
+    return transposed;
+  }
+
+  XXH32_hash_t chksum() const {
+    constexpr XXH32_hash_t HASH_SEED = 42;
+    // XXH32 expects the length in bytes, not in elements.
+    return XXH32(this->mem.data(), this->bytes(), HASH_SEED);
+  }
+
+  std::size_t bytes() const { return this->mem.size() * sizeof(T); }
+};
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, Matrix<T> &mat) {
+  for (std::uint32_t i = 0; i < mat.rows; i++) {
+    for (std::uint32_t j = 0; j < mat.cols; j++) {
+      os << mat(i, j) << "\t";
+    }
+    os << "\n";
+  }
+
+  return os;
+}
+
+#endif // MATRIX_H_
diff --git a/src/cuda/matrixMult.cu b/src/cuda/matrixMult.cu
new file mode 100644
index 0000000..1a1853d
--- /dev/null
+++ b/src/cuda/matrixMult.cu
@@ -0,0 +1,193 @@
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "matrix.hpp"
+#include "timer.hpp"
+
+#define stream_hex(_val) std::hex << _val << std::dec
+
+#define print_pair(_desc, _chksm, _time)                                      \
+  std::cout << _desc << "\n\t"                                                \
+            << "\t-> Check: 0x" << stream_hex(_chksm)                         \
+            << "\tRuntime: " << _time << " us\n\n"
+
+#define checkCudaErrors(call)                                                 \
+  do {                                                                        \
+    cudaError_t err = call;                                                   \
+    if (err != cudaSuccess) {                                                 \
+      printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,                 \
+             cudaGetErrorString(err));                                        \
+      exit(EXIT_FAILURE);                                                     \
+    }                                                                         \
+  } while (0)
+
+enum CUDA_FUNC { CUDA_NAIVE, CUDA_TILED };
+
+using data_type = int;
+
+template <typename T>
+__global__ void matrixMultCuda(T *matA, T *matB, std::uint32_t colsA,
+                               std::uint32_t colsB, T *matRes) {
+
+  const auto tidx = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto tidy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  T sum = 0;
+
+  for (std::uint32_t k = 0; k < colsA; k++) {
+    sum += matA[tidy * colsA + k] * matB[k * colsB + tidx];
+  }
+
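+  // The full dot product of row tidy of A and column tidx of B has been
+  // accumulated in a register; a single global store writes the result.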
+  matRes[tidy * colsB + tidx] = sum;
+}
+
+template <typename T>
+__global__ void matrixMultCudaTiled(T *matA, T *matB, std::uint32_t colsA,
+                                    std::uint32_t colsB, T *matRes) {
+
+  const auto tidx = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto tidy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  const auto tile_size = blockDim.x;
+
+  std::uint32_t a = tidy * colsA + threadIdx.x;
+  std::uint32_t b = tile_size * blockIdx.x + threadIdx.x + threadIdx.y * colsB;
+
+  const auto a_step = tile_size;
+  const auto b_step = tile_size * colsB;
+
+  const auto a_end = a + colsA;
+
+  extern __shared__ T localMem[];
+
+  T *localA = &localMem[0];
+  T *localB = &localMem[tile_size * tile_size];
+
+  T sum = 0;
+
+  for (; a < a_end; a += a_step, b += b_step) {
+    localA[threadIdx.y * tile_size + threadIdx.x] = matA[a];
+    localB[threadIdx.y * tile_size + threadIdx.x] = matB[b];
+
+    __syncthreads();
+
+#pragma unroll
+    for (std::uint32_t k = 0; k < tile_size; k++) {
+      sum += localA[threadIdx.y * tile_size + k] *
+             localB[k * tile_size + threadIdx.x];
+    }
+
+    __syncthreads();
+  }
+
+  matRes[tidy * colsB + tidx] = sum;
+}
+
+inline int prevPowerOfTwo(int n) {
+  if (n <= 0) {
+    return 0;
+  }
+
+  // Find the position of the most significant set bit (MSB)
+  int msbPos = 0;
+  while (n > 1) {
+    n >>= 1;
+    msbPos++;
+  }
+
+  // 1 << msbPos is the largest power of two not exceeding the input
+  int result = 1 << msbPos;
+
+  return result;
+}
+
+template <CUDA_FUNC F, typename T>
+std::uint32_t matrixMult(const Matrix<T> &matA, const Matrix<T> &matB) {
+  Matrix<T> matRes(matA.rows, matB.cols);
+
+  T *d_matA;
+  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_matA), matA.bytes()));
+  checkCudaErrors(cudaMemcpy(d_matA, matA.mem.data(), matA.bytes(),
+                             cudaMemcpyHostToDevice));
+
+  T *d_matB;
+  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_matB), matB.bytes()));
+  checkCudaErrors(cudaMemcpy(d_matB, matB.mem.data(), matB.bytes(),
+                             cudaMemcpyHostToDevice));
+
+  T *d_matRes;
+  checkCudaErrors(
+      cudaMalloc(reinterpret_cast<void **>(&d_matRes), matRes.bytes()));
+
+  // obtain the maximum number of threads per block for the given device
+  int device_id;
+  checkCudaErrors(cudaGetDevice(&device_id));
+
+  cudaDeviceProp dev_props;
+  checkCudaErrors(cudaGetDeviceProperties(&dev_props, device_id));
+
+  std::size_t max_block_size = dev_props.maxThreadsPerBlock;
+
+  // each tile must not exceed max_block_size -> T x T <= max_block_size
+  const std::uint32_t max_tile_size =
+      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_block_size)));
+
+  // the problem size of the multiplication may be smaller than the maximum
+  // tile size
+  const std::uint32_t block_size = std::min(max_tile_size, matA.cols);
+
+  // assumes the matrix dimensions are multiples of the block size
+  dim3 threads(block_size, block_size);
+  dim3 grid(matB.cols / threads.x, matA.rows / threads.y);
+
+  cudaDeviceSynchronize();
+
+  if constexpr (F == CUDA_NAIVE) {
+    matrixMultCuda<<<grid, threads>>>(d_matA, d_matB, matA.cols, matB.cols,
+                                      d_matRes);
+  } else if constexpr (F == CUDA_TILED) {
+    // two tiles (one for A, one for B) live in dynamic shared memory
+    const auto shared_mem_size = sizeof(T) * block_size * block_size * 2;
+    matrixMultCudaTiled<<<grid, threads, shared_mem_size>>>(
+        d_matA, d_matB, matA.cols, matB.cols, d_matRes);
+  }
+
+  cudaDeviceSynchronize();
+
+  checkCudaErrors(cudaMemcpy(matRes.mem.data(), d_matRes, matRes.bytes(),
+                             cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize();
+
+  checkCudaErrors(cudaFree(d_matA));
+  checkCudaErrors(cudaFree(d_matB));
+  checkCudaErrors(cudaFree(d_matRes));
+
+  return matRes.chksum();
+}
+
+auto main(int argc, char *argv[]) -> int {
+
+  if (argc != 3) {
+    std::cerr << "Provide 2 arguments to the program!\n"
+              << "Usage: matrixMult <matA>.txt <matB>.txt\n";
+    return EXIT_FAILURE;
+  }
+
+  Matrix<data_type> matA(argv[1]);
+  Matrix<data_type> matB(argv[2]);
+
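+  // Inner dimensions must agree: an (m x k) A and a (k x n) B yield an
+  // (m x n) result, e.g. (2 x 3) * (3 x 4) -> (2 x 4).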
+  assert(matA.cols == matB.rows);
+
+  const auto naive_chk =
+      measure<>::duration(matrixMult<CUDA_NAIVE, data_type>, matA, matB);
+
+  print_pair("CUDA naive", naive_chk.first, naive_chk.second.count());
+
+  const auto tiled_chk =
+      measure<>::duration(matrixMult<CUDA_TILED, data_type>, matA, matB);
+
+  print_pair("CUDA tiled", tiled_chk.first, tiled_chk.second.count());
+
+  return 0;
+}
diff --git a/src/cuda/timer.hpp b/src/cuda/timer.hpp
new file mode 100644
index 0000000..5e7d1d7
--- /dev/null
+++ b/src/cuda/timer.hpp
@@ -0,0 +1,22 @@
+#ifndef TIMER_H_
+#define TIMER_H_
+
+#include <chrono>
+#include <functional>
+#include <utility>
+
+template <typename TimeT = std::chrono::microseconds,
+          typename ClockType = std::chrono::steady_clock>
+struct measure {
+  template <typename F, typename... Args>
+  static auto duration(F &&func, Args &&...args) {
+    auto start = ClockType::now();
+    auto retVal =
+        std::invoke(std::forward<F>(func), std::forward<Args>(args)...);
+    auto end = ClockType::now();
+    return std::make_pair(retVal,
+                          std::chrono::duration_cast<TimeT>(end - start));
+  }
+};
+
+#endif // TIMER_H_
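
Build/run note: a minimal sketch, assuming nvcc and the xxHash development
headers are installed and that mat_a.txt / mat_b.txt are hypothetical input
files:

    nvcc -O2 -o matrixMult src/cuda/matrixMult.cu -lxxhash
    ./matrixMult mat_a.txt mat_b.txt

The Matrix(const char *) constructor reads a whitespace-separated text file
holding the two dimensions followed by the elements in row-major order,
e.g. for a 2 x 2 matrix:

    2 2
    1 2
    3 4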