diff --git a/src/sycl_comp.cpp b/src/sycl_comp.cpp
index 15b7e00..e32bcd8 100644
--- a/src/sycl_comp.cpp
+++ b/src/sycl_comp.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -23,6 +24,16 @@ namespace sycl = cl::sycl;
 
 using data_type = int;
 
+/**
+ * Straightforward matrix multiplication using a single CPU core and three
+ * nested loops.
+ *
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultCPU(const Matrix &matA, const Matrix &matB) {
   Matrix res(matA.rows, matB.cols);
@@ -38,9 +49,19 @@ auto matrixMultCPU(const Matrix &matA, const Matrix &matB) {
   return res.chksum();
 }
 
+/**
+ * Also a straightforward solution, but matrix B is transposed before diving
+ * into the three nested loops. This allows faster access to the second
+ * factor, as the CPU cache is utilized better.
+ *
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
-auto matrixMultTransposeCPU(const Matrix &matA,
-                            const Matrix &matB) {
+auto matrixMultTransposeCPU(const Matrix &matA, const Matrix &matB) {
   Matrix matB_t = matB.t();
   Matrix res(matA.rows, matB.cols);
   for (std::uint32_t i = 0; i < res.rows; i++) {
@@ -55,30 +76,70 @@ auto matrixMultTransposeCPU(const Matrix &matA,
   return res.chksum();
 }
 
+/**
+ * The naive implementation of the matrix multiplication using SYCL. The
+ * actual implementation (OpenMP CPU parallelization, GPU offloading) is done
+ * by the compiler.
+ *
+ * @param q The desired SYCL queue. If you want to utilize the GPU,
+ *          initialize the queue with a GPU selector.
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultSYCL(sycl::queue &q, const Matrix &matA,
                     const Matrix &matB) {
+
+  // allocate memory for the result matrix on the host
   Matrix matRes(matA.rows, matB.cols);
+
+  // define the size of our problem - here we want each task to calculate one
+  // cell of the product, leading to a problem size of N x M
  sycl::range<2> global_range(matRes.rows, matRes.cols);
 
   {
+    // Define 2-dimensional buffers which can then be exposed to the device.
+    // It is also possible to use 1D buffers here, but then we would have to
+    // manually calculate the index into the matrices for each thread in the
+    // kernel code. Solving it this way asks the compiler to do that work.
     sycl::buffer b_matA(matA.mem.data(),
-                        sycl::range<2>(matA.rows, matA.cols));
+                        sycl::range<2>(matA.rows, matA.cols));
 
     sycl::buffer b_matB(matB.mem.data(),
-                        sycl::range<2>(matB.rows, matB.cols));
+                        sycl::range<2>(matB.rows, matB.cols));
 
-    sycl::buffer b_matRes(
-        matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range<2>(matRes.rows, matRes.cols));
 
+    // Submit work to the device. This is done using a lambda function which
+    // references all values known in the enclosing scope, i.e. the previously
+    // defined buffers.
     q.submit([&](sycl::handler &h) {
+      // Create accessors to expose/copy the data to the device or mark it to
+      // be written back after successful execution of the kernel. Access can
+      // be controlled via access modes.
+      //
+      // Here, we only want matrices A and B to be read from and matrix C to
+      // be written to.
      auto acc_matA = b_matA.template get_access(h);
      auto acc_matB = b_matB.template get_access(h);
-      auto acc_matRes = b_matRes.template get_access(h);
+      auto acc_matRes =
+          b_matRes.template get_access(h);
 
+      // For the parallelized loop another lambda function is used, but all
+      // known values are passed by value, as host and device do not share the
+      // same address space.
+      //
+      // In addition to the lambda function, the parallel_for function is given
+      // the global range we defined earlier to provide the size of the problem
+      // and launch the corresponding number of tasks. The identifier of the
+      // task is then passed to the lambda function as a parameter.
      h.parallel_for(global_range, [=](sycl::id<2> ID) {
-        auto i = ID[0];
-        auto j = ID[1];
+        const auto i = ID[0];
+        const auto j = ID[1];
 
         T sum = 0;
         for (auto k = 0; k < matA.cols; k++) {
@@ -88,12 +149,30 @@ auto matrixMultSYCL(sycl::queue &q, const Matrix &matA,
     });
   }
-
-  q.wait();
 
+  // The buffers are destroyed at the end of the scope, which ensures the data
+  // is copied back from the device, so there is no need for a barrier here.
+  //
+  // We can simply return the checksum of matrix C.
   return matRes.chksum();
 }
 
+/**
+ * The implementation of the matrix multiplication using SYCL, with matrix B
+ * transposed beforehand. The actual implementation (OpenMP CPU
+ * parallelization, GPU offloading) is done by the compiler.
+ *
+ * For inline comments please refer to matrixMultSYCL(). Nothing changes except
+ * that a buffer for the transposed matrix B is created.
+ *
+ * @param q The desired SYCL queue. If you want to utilize the GPU,
+ *          initialize the queue with a GPU selector.
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA,
                              const Matrix &matB) {
@@ -104,18 +183,19 @@ auto matrixMultTransposeSYCL(sycl::queue &q, const Matrix &matA,
 
   {
     sycl::buffer b_matA(matA.mem.data(),
-                        sycl::range<2>(matA.rows, matA.cols));
+                        sycl::range<2>(matA.rows, matA.cols));
 
     sycl::buffer b_matB(matB_t.mem.data(),
-                        sycl::range<2>(matB_t.rows, matB_t.cols));
+                        sycl::range<2>(matB_t.rows, matB_t.cols));
 
-    sycl::buffer b_matRes(
-        matRes.mem.data(), sycl::range<2>(matRes.rows, matRes.cols));
+    sycl::buffer b_matRes(matRes.mem.data(),
+                          sycl::range<2>(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
       auto acc_matA = b_matA.template get_access(h);
       auto acc_matB = b_matB.template get_access(h);
-      auto acc_matRes = b_matRes.template get_access(h);
+      auto acc_matRes =
+          b_matRes.template get_access(h);
 
       h.parallel_for(global_range, [=](sycl::id<2> ID) {
         auto i = ID[0];
@@ -156,24 +236,60 @@ inline int prevPowerOfTwo(int x) {
   return x - (x >> 1);
 }
 
+/**
+ * The implementation of the matrix multiplication using SYCL with tiling.
+ *
+ * Each task fills a field in the local memory of the device with values from
+ * the global memory. For this, tasks are grouped into work groups. The size
+ * of a work group is given by the maximum value reported in the device
+ * description. All tasks of one work group share the same local memory.
+ *
+ * As the tile might be smaller than the actual sizes of matrices A and B, the
+ * tile is 'moved' along the x-axis and y-axis of those matrices, respectively.
+ *
+ * By using tiling, each task only performs N/TILE_SIZE global memory
+ * operations and otherwise uses the much faster local memory, speeding up the
+ * multiplication on most devices.
+ *
+ * Keep in mind that this function is intended to be executed on GPUs only, as
+ * the nd_range paradigm with parallel_for cannot be implemented efficiently on
+ * CPUs. See: Compilation model of AdaptiveCpp
+ *
+ * @param q The desired SYCL queue. If you want to utilize the GPU,
+ *          initialize the queue with a GPU selector.
+ * @param matA Matrix A
+ * @param matB Matrix B
+ *
+ * @return the checksum of the result produced by multiplying
+ *         matrix A and B
+ */
 template
 auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA,
                          const Matrix &matB) {
   Matrix matRes(matA.rows, matB.cols);
 
+  // obtain the maximum work group size for the given device
   std::size_t max_group_size = q.get_device().get_info();
 
-  // lets assume we always have a maximum group size with a power of 2
-  const std::uint32_t max_block_size =
+  // each tile should not exceed the max_group_size -> T x T <= max_group_size
+  const std::uint32_t max_tile_size =
      static_cast(prevPowerOfTwo(std::sqrt(max_group_size)));
 
-  const std::uint32_t block_size = std::min(matA.cols, max_block_size);
+  // the problem size of the multiplication might be smaller than the maximum
+  // tile size
+  const std::uint32_t block_size = std::min(matA.cols, max_tile_size);
 
+  // define the global range ...
   sycl::range<2> global_range(matRes.rows, matRes.cols);
-  sycl::range<2> local_range(block_size, block_size);
+
+  // ... and the local range (one tile per work group)
+  sycl::range<2> tile_range(block_size, block_size);
 
   {
+    // allocate the buffers
     sycl::buffer b_matA(matA.mem.data(),
                         sycl::range<2>(matA.rows, matA.cols));
 
@@ -184,21 +300,31 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA,
                           sycl::range<2>(matRes.rows, matRes.cols));
 
     q.submit([&](sycl::handler &h) {
+      // provide access to the buffers and ...
       auto acc_matA = b_matA.template get_access(h);
       auto acc_matB = b_matB.template get_access(h);
       auto acc_matRes = b_matRes.template get_access(h);
 
+      // ... allocate memory in the local device memory, which should be
+      // accessible to each thread, for matrix A ...
       sycl::accessor
-          tileA(local_range, h);
+          tileA(tile_range, h);
 
+      // ... and matrix B
       sycl::accessor
-          tileB(local_range, h);
+          tileB(tile_range, h);
 
+      // We define a kernel function by passing the global_range and the
+      // tile_range to the parallel_for function of the handler. Secondly,
+      // another lambda is passed as the kernel function, again with
+      // lambda captures passed by value. Its parameter is an nd_item, which
+      // can be used to extract all relevant data linked to the running task.
       h.parallel_for(
-          sycl::nd_range{global_range, local_range}, [=](sycl::nd_item<2> &ID) {
+          sycl::nd_range{global_range, tile_range}, [=](sycl::nd_item<2> &ID) {
+            // extract all relevant information
            const int i = ID.get_global_id(0);
            const int j = ID.get_global_id(1);
 
@@ -207,31 +333,40 @@ auto matrixMultTiledSYCL(sycl::queue &q, const Matrix &matA,
            const int max_tile = ID.get_group_range(0);
 
-            // Current local IDem
            T sum = 0;
+            // 'move' the tile over the matrices A and B
            for (int tile_i = 0; tile_i < max_tile; tile_i++) {
+              // each thread copies 'its' value from the global memory to the
+              // local memory
              tileA[local_i][local_j] =
                  acc_matA[i][tile_i * block_size + local_j];
+              // here we also transpose the B matrix
              tileB[local_j][local_i] =
                  acc_matB[block_size * tile_i + local_i][j];
 
+              // we need an explicit barrier to ensure that all threads of the
+              // work group have finished writing their value to the local
+              // memory
              ID.barrier(sycl::access::fence_space::local_space);
 
+              // build the 'local' sum over the parts of matrix A and B stored
+              // in the local memory
              for (auto k = 0; k < block_size; k++) {
                sum += tileA[local_i][k] * tileB[local_j][k];
              }
 
+              // ensure all threads have finished the multiplication before
+              // the local memory is overwritten
              ID.barrier(sycl::access::fence_space::local_space);
            }
 
+            // each thread stores its sum in the result matrix C
            acc_matRes[i][j] = sum;
          });
    });
  }
 
-  q.wait();
-
  return matRes.chksum();
}
 
@@ -260,21 +395,36 @@ auto main(int argc, char **argv) -> int {
 
  sycl::queue cpu_queue(sycl::cpu_selector_v);
 
-  auto omp_chksum = measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB);
+  std::cout << "Starting CPU benchmarks with device: "
+            << cpu_queue.get_device().get_info()
+            << "\n";
+
+  auto omp_chksum =
+      measure<>::duration(matrixMultSYCL, cpu_queue, matA, matB);
  print_pair("OMP - naive", omp_chksum.first, omp_chksum.second.count());
 
-  auto omp_transp_chksum =
-      measure<>::duration(matrixMultTransposeSYCL, cpu_queue, matA, matB);
+  auto omp_transp_chksum = measure<>::duration(
+      matrixMultTransposeSYCL, cpu_queue, matA, matB);
  print_pair("OMP - transposed", omp_transp_chksum.first,
             omp_transp_chksum.second.count());
 
  sycl::queue gpu_queue(sycl::gpu_selector_v);
 
-  auto gpu_chksum = measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB);
+  if (!gpu_queue.get_device().is_gpu()) {
+    std::cout << "No GPU found, skipping GPU benchmarks\n";
+    return EXIT_SUCCESS;
+  }
+
+  std::cout << "Starting GPU benchmarks with device: "
+            << gpu_queue.get_device().get_info()
+            << "\n";
+
+  auto gpu_chksum =
+      measure<>::duration(matrixMultSYCL, gpu_queue, matA, matB);
  print_pair("GPU - naive", gpu_chksum.first, gpu_chksum.second.count());
 
-  auto gpu_transp_chksum =
-      measure<>::duration(matrixMultTransposeSYCL, gpu_queue, matA, matB);
+  auto gpu_transp_chksum = measure<>::duration(
+      matrixMultTransposeSYCL, gpu_queue, matA, matB);
  print_pair("GPU - transposed", gpu_transp_chksum.first,
             gpu_transp_chksum.second.count());