Compare commits: 6bac8f5a22...77c6b84a6f

10 Commits

| SHA1       |
|------------|
| 77c6b84a6f |
| 1091a55b81 |
| 493be3576c |
| 8145d533dd |
| 9633302226 |
| 3fcd37fb1f |
| ca136ac2e6 |
| 47331f9b00 |
| 3598bb7642 |
| 69350034eb |
CMakeLists.txt
@@ -4,8 +4,33 @@ set(CMAKE_CXX_STANDARD 17)

project(sycl_example)

find_package(AdaptiveCpp REQUIRED)

option(USE_INTELSYCL "Use Intel oneAPI compiler" OFF)
option(USE_ACPP "Use AdaptiveCpp compiler" OFF)

if(USE_INTELSYCL AND USE_ACPP)
  message(FATAL_ERROR "Only one SYCL compiler can be selected.")
endif()

if(USE_INTELSYCL)
  find_package(IntelSYCL REQUIRED)
elseif(USE_ACPP)
  find_package(AdaptiveCpp REQUIRED)
else()
  message(FATAL_ERROR "Set either -DUSE_INTELSYCL or -DUSE_ACPP.")
endif()

find_library(LIB_XXHASH xxhash)

message(STATUS "${IntelSYCL_FOUND}")

if((NOT AdaptiveCpp_FOUND) AND (NOT IntelSYCL_FOUND))
  message(FATAL_ERROR
    "Could not find either AdaptiveCpp or IntelDPCPP.
     Install one of them and provide -D<Compiler>_DIR=/path/to/cmake/config.
     Configuration not possible.")
endif()

add_subdirectory(src)
README.org (135 changed lines)
@@ -8,7 +8,134 @@ multiplication on a single CPU core while using SYCL for both OpenMP and GPU
parallelization. Subsequently, we will record and analyze the execution times.

At this stage, the project showcases how to transfer and manipulate data on the
GPU using the Unified Shared Memory (USM) model with explicit data movement.
Unfortunately, I've encountered a hurdle as my current implementation with =hip=
lacks a valid USM provider for my graphics card, the AMD Radeon RX 6700 XT,
preventing me from achieving implicit data movement for demonstration 😔
GPU using +the Unified Shared Memory (USM) model with explicit data movement+ an
abstract view of the host and device memory using buffers and accessors. I will
not attempt to implement those functions using Unified Shared Memory.

For more detailed information about the implementation and how specific
functions are used, as well as explanations for the reasoning behind certain
design choices, I recommend referring to the source code itself. The source code
typically contains comments that provide insights into the code's functionality
and rationale.
* Prerequisites

To use the project, you'll need the following prerequisites:

** Mandatory Prerequisites

- A functional SYCL compiler. You can choose from options like Intel's oneAPI or
  AdaptiveCpp.

- The =xxhash= library.

** Optional Prerequisite

- CMake (for generating build files)
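As a rough orientation for Arch Linux (the distribution used in the rest of this
README), the mandatory pieces could be installed along these lines; the package
names are assumptions and may differ on your distribution:

#+BEGIN_SRC bash
# CMake and the xxhash library from the official repositories (package names assumed)
sudo pacman -S cmake xxhash

# A SYCL compiler comes either from Intel's oneAPI installer or, for
# AdaptiveCpp on AMD GPUs, from the AUR (see the Compilation section below)
#+END_SRC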
* Compilation

** Using Intel oneAPI

Finally, I've made the code run with Intel's oneAPI and adapted the CMake
generation process.

#+BEGIN_SRC bash
# Make sure to source Intel's environment variables together with the bundled LLVM!
. /opt/intel/oneapi/setvars.sh --include-intel-llvm

# Create a build directory and navigate to it
mkdir build && cd build

# Configure the build with the Intel SYCL compiler
CXX=$(which icpx) cmake .. -DUSE_INTELSYCL=ON \
    -DCMAKE_BUILD_TYPE="Release"

# Compile the executable
make
#+END_SRC
** Using AdaptiveCpp

Regrettably, integrating Intel's oneAPI with the AMD GPU plugin proves to be
quite challenging on Arch Linux, primarily due to the plugin's dependency on an
older version of ROCm than what's available in the official repositories. While
I could have chosen to compile my own ROCm/hip version, I opted for a more
convenient solution and turned to the [[https://github.com/AdaptiveCpp/AdaptiveCpp/tree/develop][AdaptiveCpp]] compiler, which offers both
CPU and GPU acceleration through CUDA and ROCm support. You can find a version
of AdaptiveCpp compatible with AMD GPUs on the AUR (Arch User Repository).

If your goal is to run benchmarks on an AMD GPU alongside AdaptiveCpp, I
recommend using [[https://github.com/sobc/pkgbuilds/tree/master/hipsycl-rocm-git][this]] specific PKGBUILD. Other versions that rely on ROCm might
not build correctly at the moment. I've already raised an issue with the
responsible maintainer of the PKGBUILDs to address this compatibility issue.

Currently, I can only utilize CMake for generating Makefiles when working with
AdaptiveCpp. However, I intend to add CMake support for Intel's oneAPI as soon
as I have a working version of the compiler.

To generate Makefiles for AdaptiveCpp, you can follow these steps:
#+BEGIN_SRC bash
# Create a build directory and navigate to it
mkdir build && cd build

# Adjust the path to AdaptiveCpp and your target devices according to your system
cmake .. -DUSE_ACPP=ON \
    -DAdaptiveCpp_DIR=/opt/AdaptiveCpp/ROCm/lib/cmake/AdaptiveCpp \
    -DACPP_TARGETS="omp.accelerated;hip.integrated-multipass;gfx90c" \
    -DCMAKE_BUILD_TYPE="Release"
#+END_SRC
You can find more information about =ACPP_TARGETS= and the compilation process in
the documentation [[https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/compilation.md][here]].

Once your Makefiles are generated, you can build the project using the following
command:

#+BEGIN_SRC bash
make -j$(nproc)
#+END_SRC

The compiled executable can be found in the =build/src= directory.
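For illustration, assuming =sycl_comp= expects the two input matrix files as
command-line arguments (the convention used by the CUDA benchmark
=src/cuda/matrixMult.cu= added below), a run from within =build/= might look
like this:

#+BEGIN_SRC bash
# Hypothetical invocation: multiply two of the provided small matrices
./src/sycl_comp ../data/sma1.txt ../data/sma2.txt
#+END_SRC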
* Data Information

I have provided a set of six matrices, two for each of three sizes:

- =sma*.txt=: These matrices are of size 16x16
- =med*.txt=: These matrices are of size 2048x2048
- =big*.txt=: These matrices are of size 8192x8192

All of these matrices are available in text file format, and you can locate them
within the =data/= directory.
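To give an impression of the expected layout (derived from the file-reading
constructor in the =matrix.hpp= shown below: the row and column counts first,
then the entries in row-major order), a hypothetical 2x2 matrix file could be
created like this:

#+BEGIN_SRC bash
# Write a small example matrix in the loader's format
cat > my_matrix.txt <<EOF
2 2
1 2
3 4
EOF
#+END_SRC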
*Important note*:

A word of caution when working with the large matrices (=big*.txt=): To avoid
exceedingly long execution times, it is advisable to disable the benchmark for a
single CPU core. You can achieve this by invoking CMake with the option
=-DSYCL_EX_COMPILE_SEQUENTIAL_BENCH=OFF= and then recompiling the executable
accordingly.
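A minimal sketch of such a re-configuration, assuming an already configured
=build/= directory:

#+BEGIN_SRC bash
cd build

# Re-run CMake with the sequential benchmark disabled, then rebuild
cmake .. -DSYCL_EX_COMPILE_SEQUENTIAL_BENCH=OFF
make -j$(nproc)
#+END_SRC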
Additionally, the table below lists the results of multiplying all combinations
of these matrices, along with their corresponding checksums. Please feel free to
reach out if you come across any other checksums or encounter further questions.

| Matrix A   | Matrix B   | Checksum     |
|------------+------------+--------------|
| =sma1.txt= | =sma1.txt= | =0xe6134d8e= |
| =sma2.txt= | =sma2.txt= | =0xf1ba0ac6= |
| =sma1.txt= | =sma2.txt= | =0xe71fdf1e= |
| =sma2.txt= | =sma1.txt= | =0x36b44d2c= |
|------------+------------+--------------|
| =med1.txt= | =med1.txt= | =0xd92eb6d6= |
| =med2.txt= | =med2.txt= | =0x9f0e1206= |
| =med1.txt= | =med2.txt= | =0x4cf45b91= |
| =med2.txt= | =med1.txt= | =0xfdeb52bf= |
|------------+------------+--------------|
| =big1.txt= | =big1.txt= | =0xde9b4c0d= |
| =big2.txt= | =big2.txt= | =0x05365fc1= |
| =big1.txt= | =big2.txt= | =0xb185e6c1= |
| =big2.txt= | =big1.txt= | =0x59f5ffef= |
src/CMakeLists.txt
@@ -1,5 +1,7 @@
add_executable(sycl_comp sycl_comp.cpp)
add_sycl_to_target(TARGET sycl_comp)

add_sycl_to_target(TARGET sycl_comp SOURCES sycl_comp.cpp)

target_link_libraries(sycl_comp PRIVATE ${LIB_XXHASH})

option(SYCL_EX_COMPILE_SEQUENTIAL_BENCH
src/cuda/matrix.hpp (new file, 87 lines)
@@ -0,0 +1,87 @@
#ifndef MATRIX_H_
#define MATRIX_H_

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <vector>

#include <xxhash.h>

template <class T> struct Matrix {
  std::uint32_t rows;
  std::uint32_t cols;
  std::vector<T> mem;

  Matrix(std::uint32_t _rows, std::uint32_t _cols)
      : rows(_rows), cols(_cols) {
    mem.resize(rows * cols);
  }

  // Reads a matrix from a whitespace-separated text file: the first two values
  // are the row and column counts, followed by the entries in row-major order.
  Matrix(const char *filepath) {
    std::ifstream matfs(filepath);

    if (matfs.fail() || !matfs.is_open()) {
      throw std::runtime_error("Error opening matrix file");
    }

    matfs >> this->rows >> this->cols;
    this->mem.resize(this->rows * this->cols);

    for (std::uint32_t i = 0; i < rows; i++) {
      for (std::uint32_t j = 0; j < cols; j++) {
        matfs >> (*this)(i, j);
      }
    }

    matfs.close();
  }

  T &operator()(std::uint32_t row_i, std::uint32_t col_j) {
    return mem[row_i * cols + col_j];
  }

  const T &operator()(std::uint32_t row_i, std::uint32_t col_j) const {
    return mem[row_i * cols + col_j];
  }

  Matrix<T> &operator=(const Matrix<T> &mat) {
    this->rows = mat.rows;
    this->cols = mat.cols;
    this->mem = mat.mem;

    return *this;
  }

  // Returns the transpose; note that this assumes a square matrix, since the
  // row and column counts are not swapped in the copy.
  Matrix<T> t() const {
    Matrix<T> transposed = *this;
    for (std::uint32_t i = 0; i < this->rows; i++) {
      for (std::uint32_t j = 0; j < this->cols; j++) {
        transposed(j, i) = (*this)(i, j);
      }
    }
    return transposed;
  }

  XXH32_hash_t chksum() const {
    constexpr XXH32_hash_t HASH_SEED = 42;
    return XXH32(this->mem.data(), mem.size(), HASH_SEED);
  }

  std::size_t bytes() const { return this->mem.size() * sizeof(T); }
};

template <class T> std::ostream &operator<<(std::ostream &os, Matrix<T> &mat) {
  for (std::uint32_t i = 0; i < mat.rows; i++) {
    for (std::uint32_t j = 0; j < mat.cols; j++) {
      os << mat(i, j) << "\t";
    }
    os << "\n";
  }

  return os;
}

#endif // MATRIX_H_
src/cuda/matrixMult.cu (new file, 193 lines)
@@ -0,0 +1,193 @@
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cuda_runtime.h>
#include <iostream>

#include "matrix.hpp"
#include "timer.hpp"

#define stream_hex(_val) std::hex << _val << std::dec

#define print_pair(_desc, _chksm, _time)                                       \
  std::cout << _desc << "\n\t"                                                 \
            << "\t-> Check: 0x" << stream_hex(_chksm)                          \
            << "\tRuntime: " << _time << " us\n\n"

#define checkCudaErrors(call)                                                  \
  do {                                                                         \
    cudaError_t err = call;                                                    \
    if (err != cudaSuccess) {                                                  \
      printf("CUDA error at %s %d: %s\n", __FILE__, __LINE__,                  \
             cudaGetErrorString(err));                                         \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

enum CUDA_FUNC { CUDA_NAIVE, CUDA_TILED };

using data_type = int;

// Naive kernel: one thread per output element. Assumes square matrices whose
// dimension is a multiple of the block size (no bounds checks are performed).
template <class T>
__global__ void matrixMultCuda(T *matA, T *matB, std::uint32_t colsA,
                               T *matRes) {

  const auto tidx = blockIdx.x * blockDim.x + threadIdx.x;
  const auto tidy = blockIdx.y * blockDim.y + threadIdx.y;

  T sum = 0;

  for (std::uint32_t k = 0; k < colsA; k++) {
    sum += matA[tidy * colsA + k] * matB[k * colsA + tidx];
  }

  matRes[tidy * colsA + tidx] = sum;
}

// Tiled kernel: each block stages matching tiles of A and B in shared memory
// before accumulating partial sums.
template <class T>
__global__ void matrixMultCudaTiled(T *matA, T *matB, std::uint32_t colsA,
                                    std::uint32_t colsB, T *matRes) {

  const auto tidx = blockIdx.x * blockDim.x + threadIdx.x;
  const auto tidy = blockIdx.y * blockDim.y + threadIdx.y;

  const auto tile_size = blockDim.x;

  std::uint32_t a = tidy * colsA + threadIdx.x;
  std::uint32_t b = tile_size * blockIdx.x + threadIdx.x + threadIdx.y * colsB;

  const auto a_step = tile_size;
  const auto b_step = tile_size * colsB;

  const auto a_end = a + colsA;

  extern __shared__ T localMem[];

  T *localA = &localMem[0];
  T *localB = &localMem[tile_size * tile_size];

  T sum = 0;

  for (; a < a_end; a += a_step, b += b_step) {
    localA[threadIdx.y * tile_size + threadIdx.x] = matA[a];
    localB[threadIdx.y * tile_size + threadIdx.x] = matB[b];

    __syncthreads();

#pragma unroll
    for (std::uint32_t k = 0; k < tile_size; k++) {
      sum += localA[threadIdx.y * tile_size + k] *
             localB[k * tile_size + threadIdx.x];
    }

    __syncthreads();
  }

  matRes[tidy * colsA + tidx] = sum;
}

inline int prevPowerOfTwo(int n) {
  if (n <= 0) {
    return 0;
  }

  // Find the position of the most significant bit (MSB)
  int msbPos = 0;
  while (n > 1) {
    n >>= 1;
    msbPos++;
  }

  // The previous power of two is 1 shifted to the MSB position
  int result = 1 << msbPos;

  return result;
}

template <class T, CUDA_FUNC F>
std::uint32_t matrixMult(const Matrix<T> &matA, const Matrix<T> &matB) {
  Matrix<T> matRes(matA.rows, matB.cols);

  T *d_matA;
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_matA), matA.bytes()));
  checkCudaErrors(cudaMemcpy(d_matA, matA.mem.data(), matA.bytes(),
                             cudaMemcpyHostToDevice));

  T *d_matB;
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_matB), matB.bytes()));
  checkCudaErrors(cudaMemcpy(d_matB, matB.mem.data(), matB.bytes(),
                             cudaMemcpyHostToDevice));

  T *d_matRes;
  checkCudaErrors(
      cudaMalloc(reinterpret_cast<void **>(&d_matRes), matRes.bytes()));

  // obtain the maximum work group size for the given device
  int device_id;
  checkCudaErrors(cudaGetDevice(&device_id));

  cudaDeviceProp dev_props;
  checkCudaErrors(cudaGetDeviceProperties(&dev_props, device_id));

  std::size_t max_block_size = dev_props.maxThreadsPerBlock;

  // each tile should not exceed the max_group_size -> T x T <= max_group_size
  const std::uint32_t max_tile_size =
      static_cast<std::uint32_t>(prevPowerOfTwo(std::sqrt(max_block_size)));

  // maybe the problem size of the multiplication is smaller than the maximum
  // tile size
  const std::uint32_t block_size = std::min(max_tile_size, matA.cols);

  dim3 threads(block_size, block_size);
  dim3 grid(matB.cols / threads.x, matA.rows / threads.y);

  cudaDeviceSynchronize();

  if constexpr (F == CUDA_NAIVE) {
    matrixMultCuda<T><<<grid, threads>>>(d_matA, d_matB, matA.cols, d_matRes);
  } else if constexpr (F == CUDA_TILED) {
    const auto shared_mem_size = sizeof(T) * block_size * block_size * 2;
    matrixMultCudaTiled<T><<<grid, threads, shared_mem_size>>>(
        d_matA, d_matB, matA.cols, matB.cols, d_matRes);
  }

  cudaDeviceSynchronize();

  checkCudaErrors(cudaMemcpy(matRes.mem.data(), d_matRes, matRes.bytes(),
                             cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();

  checkCudaErrors(cudaFree(d_matA));
  checkCudaErrors(cudaFree(d_matB));
  checkCudaErrors(cudaFree(d_matRes));

  return matRes.chksum();
}

auto main(int argc, char *argv[]) -> int {

  if (argc != 3) {
    std::cerr << "Provide 2 arguments to the program!\n"
              << "Usage: <prog> <matA>.txt <matB>.txt\n";
    return EXIT_FAILURE;
  }

  Matrix<data_type> matA(argv[1]);
  Matrix<data_type> matB(argv[2]);

  assert(matA.rows == matB.cols);

  const auto naive_chk =
      measure<>::duration(matrixMult<data_type, CUDA_NAIVE>, matA, matB);

  print_pair("CUDA naive", naive_chk.first, naive_chk.second.count());

  const auto tiled_chk =
      measure<>::duration(matrixMult<data_type, CUDA_TILED>, matA, matB);

  print_pair("CUDA tiled", tiled_chk.first, tiled_chk.second.count());

  return 0;
}
src/cuda/timer.hpp (new file, 22 lines)
@@ -0,0 +1,22 @@
#ifndef TIMER_H_
#define TIMER_H_

#include <chrono>
#include <functional>
#include <utility>

// Measures the wall-clock duration of a callable and returns a pair of its
// return value and the elapsed time in the requested unit.
template <class TimeUnit = std::chrono::microseconds,
          class ClockType = std::chrono::steady_clock>
struct measure {
  template <class F, class... Args>
  static auto duration(F &&func, Args &&...args) {
    auto start = ClockType::now();
    auto retVal =
        std::invoke(std::forward<F>(func), std::forward<Args>(args)...);
    auto end = ClockType::now();
    return std::make_pair(retVal,
                          std::chrono::duration_cast<TimeUnit>(end - start));
  }
};

#endif // TIMER_H_
src/sycl_comp.cpp
@@ -1,20 +1,18 @@
#include <sycl/sycl.hpp>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <hipSYCL/sycl/info/device.hpp>
#include <iostream>

#include <CL/sycl.hpp>
#include <utility>

#include "matrix.hpp"
#include "timer.hpp"

namespace sycl = cl::sycl;

#define stream_hex(_val) std::hex << _val << std::dec

#define print_pair(_desc, _chksm, _time) \
@@ -100,19 +98,19 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
  // cell of the product. Thus, leading to a problem size of N x M
  sycl::range<2> global_range(matRes.rows, matRes.cols);

  const auto &inner_loop = matA.cols;

  {
    // defining 2-dimensional buffers which can then be exposed to the device.
    // It is also possible to use 1D buffers here, but then we have to manually
    // calculate the index to access the matrices for each thread in the kernel
    // code. Solving it this way will ask the compiler to do the work.
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));
    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));

    sycl::buffer<T, 2> b_matB(matB.mem.data(),
                              sycl::range<2>(matB.rows, matB.cols));
    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));

    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));
    sycl::buffer b_matRes(matRes.mem.data(),
                          sycl::range(matRes.rows, matRes.cols));

    // submit work to the device. this is done by using a lambda function which
    // references all values known to the scope i.e. the previously defined
@@ -124,10 +122,9 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
      //
      // Here, we only need matrix A and B to be read from and matrix C to be
      // written to.
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);
      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);

      // For the parallelized loop another lambda function is used, but all
      // known values are passed by value, as host and device don't share the
@@ -137,12 +134,12 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix<T> &matA,
      // the global range we defined earlier to provide the size of the problem
      // and launch the count of tasks accordingly. The identifier of the task
      // is then passed to the lambda function as a parameter.
      h.parallel_for(global_range, [=](sycl::id<2> ID) {
      h.parallel_for(global_range, [=](auto ID) {
        const auto i = ID[0];
        const auto j = ID[1];
        T sum = 0;

        for (auto k = 0; k < matA.cols; k++) {
        for (auto k = 0; k < inner_loop; k++) {
          sum += acc_matA[i][k] * acc_matB[k][j];
        }
        acc_matRes[i][j] = sum;
@@ -179,30 +176,30 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,

  Matrix<T> matB_t = matB.t();
  Matrix<T> matRes(matA.rows, matB.cols);
  sycl::range<2> global_range(matRes.rows, matRes.cols);
  sycl::range global_range(matRes.rows, matRes.cols);

  const auto &inner_loop = matA.cols;

  {
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));
    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));

    sycl::buffer<T, 2> b_matB(matB_t.mem.data(),
                              sycl::range<2>(matB_t.rows, matB_t.cols));
    sycl::buffer b_matB(matB_t.mem.data(),
                        sycl::range(matB_t.rows, matB_t.cols));

    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));
    sycl::buffer b_matRes(matRes.mem.data(),
                          sycl::range(matRes.rows, matRes.cols));

    q.submit([&](sycl::handler &h) {
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);
      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);

      h.parallel_for(global_range, [=](sycl::id<2> ID) {
      h.parallel_for(global_range, [=](auto ID) {
        auto i = ID[0];
        auto j = ID[1];
        T sum = 0;

        for (auto k = 0; k < matA.cols; k++) {
        for (auto k = 0; k < inner_loop; k++) {
          sum += acc_matA[i][k] * acc_matB[j][k];
        }
        acc_matRes[i][j] = sum;
@@ -215,25 +212,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix<T> &matA,
  return matRes.chksum();
}

/*
 * Obtained from
 * https://github.com/codeplaysoftware/computecpp-sdk/blob/master/samples/matrix-multiply.cpp
 *
 * Obtains the previous power of two from the given integer.
 * It works by masking out all ones after the first one bit,
 * then leaves the first one bit intact, effectively
 * yielding the first power of two < x. */
inline int prevPowerOfTwo(int x) {
  if (x < 0) {
inline int prevPowerOfTwo(int n) {
  if (n <= 0) {
    return 0;
  }
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x - (x >> 1);

  // Calculate the most significant bit position (MSB)
  int msbPos = 0;
  while (n > 1) {
    n >>= 1;
    msbPos++;
  }

  return 1 << msbPos;
}
/**
@@ -290,32 +281,25 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,

  {
    // allocate the buffers
    sycl::buffer<T, 2> b_matA(matA.mem.data(),
                              sycl::range<2>(matA.rows, matA.cols));
    sycl::buffer b_matA(matA.mem.data(), sycl::range(matA.rows, matA.cols));

    sycl::buffer<T, 2> b_matB(matB.mem.data(),
                              sycl::range<2>(matB.rows, matB.cols));
    sycl::buffer b_matB(matB.mem.data(), sycl::range(matB.rows, matB.cols));

    sycl::buffer<T, 2> b_matRes(matRes.mem.data(),
                                sycl::range<2>(matRes.rows, matRes.cols));
    sycl::buffer b_matRes(matRes.mem.data(),
                          sycl::range(matRes.rows, matRes.cols));

    q.submit([&](sycl::handler &h) {
      // provide access to the buffers and ...
      auto acc_matA = b_matA.template get_access<sycl::access::mode::read>(h);
      auto acc_matB = b_matB.template get_access<sycl::access::mode::read>(h);
      auto acc_matRes =
          b_matRes.template get_access<sycl::access::mode::write>(h);
      sycl::accessor acc_matA(b_matA, h, sycl::read_only);
      sycl::accessor acc_matB(b_matB, h, sycl::read_only);
      sycl::accessor acc_matRes(b_matRes, h, sycl::write_only);

      // ... allocate memory in the local device memory which should be
      // accessible to each thread per matrix A ...
      sycl::accessor<int, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          tileA(tile_range, h);
      sycl::local_accessor<T, 2> tileA(tile_range, h);

      // ... and matrix B
      sycl::accessor<int, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          tileB(tile_range, h);
      sycl::local_accessor<T, 2> tileB(tile_range, h);

      // We define a kernel function by passing the global_range and the
      // tile_range to the parallel_for function of the handler. Secondly,
@@ -323,7 +307,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
      // passed-by-value lambda captures. An nd_item serves as the parameter,
      // which can be used to extract all relevant data linked to the running task.
      h.parallel_for<class tiled_matmul>(
          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) {
          sycl::nd_range{global_range, tile_range}, [=](auto ID) {
            // extract all relevant information
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);
@@ -342,7 +326,7 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
            tileA[local_i][local_j] =
                acc_matA[i][tile_i * block_size + local_j];
            // here we will also transpose the B matrix
            tileB[local_j][local_i] =
            tileB[local_i][local_j] =
                acc_matB[block_size * tile_i + local_i][j];

            // we need an explicit barrier to ensure all threads of the
@@ -350,10 +334,11 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix<T> &matA,
            // memory
            ID.barrier(sycl::access::fence_space::local_space);

            // build the 'local' sum over the part of matrix A and B stored in
            // the local memory
#pragma unroll
            for (auto k = 0; k < block_size; k++) {
              sum += tileA[local_i][k] * tileB[local_j][k];
              // build the 'local' sum over the part of matrix A and B stored
              // in the local memory
              sum += tileA[local_i][k] * tileB[k][local_j];
            }

            // ensure all threads finished the multiplication, allowing to