From db765fff8d7e1329b14479c2b259c42fc351d764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Max=20L=C3=BCbke?= Date: Fri, 29 Sep 2023 17:16:41 +0200 Subject: [PATCH] Initial commit --- CMakeLists.txt | 8 ++++++++ README.org | 14 ++++++++++++++ sycl_comp.cpp | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README.org create mode 100644 sycl_comp.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4a68522 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.25) + +project(sycl_example) + +find_package(AdaptiveCpp REQUIRED) + +add_executable(sycl_comp sycl_comp.cpp) +add_sycl_to_target(TARGET sycl_comp) diff --git a/README.org b/README.org new file mode 100644 index 0000000..a123d06 --- /dev/null +++ b/README.org @@ -0,0 +1,14 @@ +#+title: Matrix multiplication with SYCL, yay + +This project serves as a sample demonstration of SYCL syntax and offers a +straightforward program as an illustration. + +Its primary objective is to function as a benchmark for executing matrix +multiplication on a single CPU core while using SYCL for both OpenMP and GPU +parallelization. Subsequently, we will record and analyze the execution times. + +At this stage, the project showcases how to transfer and manipulate data on the +GPU using the Unified Shared Memory (USM) model with explicit data movement. 
+Unfortunately, I've encountered a hurdle as my current implementation with =hip=
+lacks a valid USM provider for my graphics card, the AMD Radeon RX 6700 XT,
+preventing me from achieving implicit data movement for demonstration 😔
diff --git a/sycl_comp.cpp b/sycl_comp.cpp
new file mode 100644
index 0000000..d6889cd
--- /dev/null
+++ b/sycl_comp.cpp
@@ -0,0 +1,38 @@
+#include <iostream>
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+// Round-trips 42 ints through device memory: copy in, increment, copy back.
+auto main(int argc, char **argv) -> int {
+  queue q; // default-constructed queue; all work below is submitted to it
+  std::cout << "Using device: " << q.get_device().get_info<info::device::name>()
+            << "\n";
+
+  int hostArray[42];
+  auto deviceArray = static_cast<int *>(malloc_device(42 * sizeof(int), q));
+
+  for (int i = 0; i < 42; i++) {
+    hostArray[i] = i;
+  }
+
+  // Explicit host -> device copy; wait() before launching the kernel.
+  q.memcpy(deviceArray, hostArray, 42 * sizeof(int));
+  q.wait();
+
+  q.submit([&](handler &h) {
+    h.parallel_for(range<1>(42), [=](auto ID) { deviceArray[ID]++; });
+  });
+  q.wait();
+
+  // Copy the results back, then release the USM allocation made above.
+  q.memcpy(hostArray, deviceArray, 42 * sizeof(int));
+  q.wait();
+  free(deviceArray, q);
+
+  for (int i = 0; i < 42; i++) {
+    std::cout << hostArray[i] << " ";
+  }
+  std::cout << "\n";
+}