From 38750485ca388c1056d0f9d7f52fc94891db89fd Mon Sep 17 00:00:00 2001 From: Max Luebke Date: Mon, 17 Jun 2024 19:37:27 +0200 Subject: [PATCH] chore: Optimize SYCL implementation of FTCS algorithm The commit optimizes the SYCL implementation of the FTCS algorithm by minimizing copy operations and using a row-major matrix internally. --- .../{FTCS_1D_sycl.cpp => FTCS_2D_sycl.cpp} | 49 ++-- include/tug/Core/FTCS_Sycl.hpp | 215 +++++++++++++++--- include/tug/Simulation.hpp | 7 + 3 files changed, 215 insertions(+), 56 deletions(-) rename examples/{FTCS_1D_sycl.cpp => FTCS_2D_sycl.cpp} (58%) diff --git a/examples/FTCS_1D_sycl.cpp b/examples/FTCS_2D_sycl.cpp similarity index 58% rename from examples/FTCS_1D_sycl.cpp rename to examples/FTCS_2D_sycl.cpp index eb688cb..6d92ac4 100644 --- a/examples/FTCS_1D_sycl.cpp +++ b/examples/FTCS_2D_sycl.cpp @@ -7,60 +7,61 @@ using namespace tug; using namespace Eigen; -MatrixXd runNormal(MatrixXd concentrations, const MatrixXd &alphas, - Boundary &boundaries, int iterations, +MatrixXf runNormal(MatrixXf concentrations, const MatrixXf &alphas, + Boundary &boundaries, int iterations, double timestep) { - Grid64 grid(concentrations.cols()); + Grid32 grid(concentrations.rows(), concentrations.cols()); grid.setConcentrations(concentrations); - grid.setAlpha(alphas); + grid.setAlpha(alphas, alphas); Simulation simulation = - Simulation(grid, boundaries); + Simulation(grid, boundaries); simulation.setTimestep(timestep); simulation.setIterations(iterations); simulation.run(); - return MatrixXd(grid.getConcentrations()); + return MatrixXf(grid.getConcentrations()); } -MatrixXd runWithSYCL(MatrixXd concentrations, const MatrixXd &alphas, - Boundary &boundaries, int iterations, +MatrixXf runWithSYCL(MatrixXf concentrations, const MatrixXf &alphas, + Boundary &boundaries, int iterations, double timestep) { - Grid64 grid(concentrations.cols()); + Grid32 grid(concentrations.rows(), concentrations.cols()); grid.setConcentrations(concentrations); - grid.setAlpha(alphas); + grid.setAlpha(alphas, alphas); - Simulation simulation = Simulation(grid, boundaries); + Simulation simulation = Simulation(grid, boundaries); simulation.setTimestep(timestep); simulation.setIterations(iterations); simulation.run(); - return MatrixXd(grid.getConcentrations()); + return MatrixXf(grid.getConcentrations()); } int main(int argc, char **argv) { - constexpr int cells = 1e7; - constexpr double timestep = 0.1; - constexpr int iterations = 500; + constexpr int cells = 3e3; + constexpr double timestep = 500; + constexpr int iterations = 1; - Grid64 grid(cells); + MatrixXf concentrations(cells, cells); + concentrations.setConstant(20); + concentrations(0, 0) = 100; - MatrixXd concentrations = MatrixXd::Constant(1, cells, 20); - - MatrixXd alphas = MatrixXd::Constant(1, cells, 1); + MatrixXf alphas(cells, cells); + alphas.setConstant(1); // ****************** // **** BOUNDARY **** // ****************** // create a boundary with constant values - Boundary bc = Boundary(grid); - bc.setBoundarySideConstant(BC_SIDE_LEFT, 1); - bc.setBoundarySideConstant(BC_SIDE_RIGHT, 1); + Boundary bc = Boundary(cells, cells); + // bc.setBoundarySideConstant(BC_SIDE_LEFT, 1); + // bc.setBoundarySideConstant(BC_SIDE_RIGHT, 1); auto t_normal_begin = std::chrono::steady_clock::now(); auto normal = runNormal(concentrations, alphas, bc, iterations, timestep); @@ -85,9 +86,9 @@ int main(int argc, char **argv) { // std::cout << "Normal: " << std::endl << normal << std::endl; // std::cout << "SYCL: " << std::endl << sycl << std::endl; - auto diff = (normal - sycl).cwiseAbs().maxCoeff(); + auto diff = normal - sycl; - std::cout << "Max diff: " << diff << std::endl; + std::cout << "Max diff: " << diff.cwiseAbs().maxCoeff() << std::endl; return 0; } \ No newline at end of file diff --git a/include/tug/Core/FTCS_Sycl.hpp b/include/tug/Core/FTCS_Sycl.hpp index 28479c5..c2ede92 100644 --- a/include/tug/Core/FTCS_Sycl.hpp +++ b/include/tug/Core/FTCS_Sycl.hpp @@ -2,8 +2,8 @@ #include "../Boundary.hpp" #include "../Grid.hpp" -#include "sycl/usm.hpp" +#include #include #include #include @@ -11,9 +11,10 @@ namespace tug { template -static inline constexpr T boundApply(const T conc, const T conc_neighbor, - const T alpha, const T alpha_neighbor, - const T bcValue, const BC_TYPE bcType) { +static inline constexpr T +evaluateBoundary(const T conc, const T conc_neighbor, const T alpha, + const T alpha_neighbor, const T bcValue, + const BC_TYPE bcType) { const T alpha_intercell = calcAlphaIntercell(alpha, alpha_neighbor); switch (bcType) { case BC_TYPE::BC_TYPE_CLOSED: { @@ -33,17 +34,145 @@ static inline constexpr T boundApply(const T conc, const T conc_neighbor, } template -static inline constexpr T explicitChange(const T conc, const T conc_n1, - const T conc_n2, const T alpha, - const T alpha_n1, const T alpha_n2) { - const T alpha_intercell_n1 = calcAlphaIntercell(alpha, alpha_n1); - const T alpha_intercell_n2 = calcAlphaIntercell(alpha, alpha_n2); +static inline constexpr T +calculateExplicitChange(const std::size_t idx_base, const std::size_t idx_n1, + const std::size_t idx_n2, const T *conc, + const T *alpha) { + const T alpha_intercell_n1 = + calcAlphaIntercell(alpha[idx_base], alpha[idx_n1]); + const T alpha_intercell_n2 = + calcAlphaIntercell(alpha[idx_base], alpha[idx_n2]); - const T conc_term_n1 = alpha_intercell_n1 * conc_n1; - const T conc_term_n2 = alpha_intercell_n2 * conc_n2; + const T conc_term_n1 = alpha_intercell_n1 * conc[idx_n1]; + const T conc_term_n2 = alpha_intercell_n2 * conc[idx_n2]; return conc_term_n1 + conc_term_n2 - - (alpha_intercell_n1 + alpha_intercell_n2) * conc; + (alpha_intercell_n1 + alpha_intercell_n2) * conc[idx_base]; +} + +template +static inline void +FTCS_SYCL_2D(sycl::queue &q, T *concentrations_inout, const T *alphas_x_in, + const T *alphas_y_in, const tug::BoundaryElement *boundaries, + const std::size_t colMax, const std::size_t rowMax, + const T dtOverDeltaColSquare, const T dtOverDeltaRowSquare, + const std::size_t nIter) { + + T *concentrations_device = sycl::malloc_device(colMax * rowMax, q); + T *concentrations_t1_device = sycl::malloc_device(colMax * rowMax, q); + T *alpha_x_device = sycl::malloc_device(colMax * rowMax, q); + T *alpha_y_device = sycl::malloc_device(colMax * rowMax, q); + tug::BoundaryElement *boundaries_device = + sycl::malloc_device>(2 * colMax + 2 * rowMax, q); + + q.memcpy(concentrations_device, concentrations_inout, + colMax * rowMax * sizeof(T)); + q.memcpy(alpha_x_device, alphas_x_in, colMax * rowMax * sizeof(T)); + q.memcpy(alpha_y_device, alphas_y_in, colMax * rowMax * sizeof(T)); + q.memcpy(boundaries_device, boundaries, + (2 * colMax + 2 * rowMax) * sizeof(BoundaryElement)); + + const tug::BoundaryElement *leftbound = &boundaries_device[0]; + const tug::BoundaryElement *rightbound = &boundaries_device[rowMax]; + const tug::BoundaryElement *topbound = &boundaries_device[2 * rowMax]; + const tug::BoundaryElement *bottombound = + &boundaries_device[2 * rowMax + colMax]; + + const std::size_t wgMaxSize = + q.get_device().get_info(); + + constexpr std::size_t CUDA_WARP_SIZE = 32; + const std::size_t col_wg = std::min(colMax, CUDA_WARP_SIZE); + const std::size_t row_wg = wgMaxSize / col_wg; + + auto round_to_multiple = [](std::size_t value, std::size_t multiple) { + return ((value + multiple - 1) / multiple) * multiple; + }; + + sycl::range<2> wg_range(col_wg, row_wg); + sycl::range<2> global_range = sycl::range<2>( + round_to_multiple(colMax, col_wg), round_to_multiple(rowMax, row_wg)); + + q.wait(); + + for (std::size_t i = 0; i < nIter; i++) { + q.submit([&](sycl::handler &h) { + h.parallel_for( + sycl::nd_range<2>(global_range, wg_range), + [=](sycl::nd_item<2> item2DIndex) { + const std::size_t col_i = item2DIndex.get_global_id(0); + const std::size_t row_j = item2DIndex.get_global_id(1); + + if (col_i >= colMax || row_j >= rowMax) { + return; + } + + const std::size_t idx_self = col_i + row_j * colMax; + const std::size_t idx_left = idx_self - 1; + const std::size_t idx_right = idx_self + 1; + const std::size_t idx_top = idx_self - colMax; + const std::size_t idx_bottom = idx_self + colMax; + + T vertical_change; + + if (row_j == 0 || row_j == rowMax - 1) { + const std::size_t neighbor_idx = + row_j == 0 ? idx_bottom : idx_top; + const tug::BoundaryElement &bound = + row_j == 0 ? topbound[col_i] : bottombound[col_i]; + + vertical_change = evaluateBoundary( + concentrations_device[idx_self], + concentrations_device[neighbor_idx], alpha_y_device[idx_self], + alpha_y_device[neighbor_idx], bound.getValue(), + bound.getType()); + } else { + vertical_change = calculateExplicitChange( + idx_self, idx_bottom, idx_top, concentrations_device, + alpha_y_device); + } + + T horizontal_change; + + if (col_i == 0 || col_i == colMax - 1) { + const std::size_t neighbor_idx = + col_i == 0 ? idx_right : idx_left; + const tug::BoundaryElement &bound = + col_i == 0 ? leftbound[row_j] : rightbound[row_j]; + + horizontal_change = evaluateBoundary( + concentrations_device[idx_self], + concentrations_device[neighbor_idx], alpha_x_device[idx_self], + alpha_x_device[neighbor_idx], bound.getValue(), + bound.getType()); + } else { + horizontal_change = calculateExplicitChange( + idx_self, idx_right, idx_left, concentrations_device, + alpha_x_device); + } + concentrations_t1_device[idx_self] = + concentrations_device[idx_self] + + dtOverDeltaColSquare * horizontal_change + + dtOverDeltaRowSquare * vertical_change; + }); + }); + + q.wait(); + + T *tmp = concentrations_device; + concentrations_device = concentrations_t1_device; + concentrations_t1_device = tmp; + } + + q.memcpy(concentrations_inout, concentrations_device, + colMax * rowMax * sizeof(T)) + .wait(); + + sycl::free(concentrations_device, q); + sycl::free(concentrations_t1_device, q); + sycl::free(alpha_x_device, q); + sycl::free(alpha_y_device, q); + sycl::free(boundaries_device, q); } template @@ -60,6 +189,9 @@ FTCS_SYCL_1D(sycl::queue &q, T *concentrations_inout, const T *alphas_in, const T bcValue_left = leftbound.getValue(); const T bcValue_right = rightbound.getValue(); + const std::size_t max_wg_size = + q.get_device().get_info(); + { T *concentrations_device = sycl::malloc_device(colMax, q); T *concentrations_t1_device = sycl::malloc_device(colMax, q); @@ -76,9 +208,9 @@ FTCS_SYCL_1D(sycl::queue &q, T *concentrations_inout, const T *alphas_in, concentrations_t1_device[0] = concentrations_device[0] + dtOverDeltaColSquare * - boundApply(concentrations_device[0], concentrations_device[1], - alpha_device[0], alpha_device[1], bcValue_left, - bcType_left); + evaluateBoundary(concentrations_device[0], + concentrations_device[1], alpha_device[0], + alpha_device[1], bcValue_left, bcType_left); }); }); @@ -87,29 +219,37 @@ FTCS_SYCL_1D(sycl::queue &q, T *concentrations_inout, const T *alphas_in, concentrations_t1_device[colMax - 1] = concentrations_device[colMax - 1] + dtOverDeltaColSquare * - boundApply(concentrations_device[colMax - 1], - concentrations_device[colMax - 2], - alpha_device[colMax - 1], alpha_device[colMax - 2], - bcValue_right, bcType_right); + evaluateBoundary(concentrations_device[colMax - 1], + concentrations_device[colMax - 2], + alpha_device[colMax - 1], + alpha_device[colMax - 2], bcValue_right, + bcType_right); }); }); auto e_mid = q.submit([&](sycl::handler &h) { - h.parallel_for(sycl::range<1>(colMax - 2), [=](sycl::id<1> idx) { - const std::size_t i = idx[0] + 1; - concentrations_t1_device[i] = - concentrations_device[i] + - dtOverDeltaColSquare * - explicitChange(concentrations_device[i], - concentrations_device[i + 1], - concentrations_device[i - 1], alpha_device[i], - alpha_device[i + 1], alpha_device[i - 1]); - }); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(colMax - 2), + sycl::range<1>(max_wg_size)), + [=](sycl::nd_item<1> item) { + const std::size_t i = item.get_global_id(0) + 1; + concentrations_t1_device[i] = + concentrations_device[i] + + dtOverDeltaColSquare * + calculateExplicitChange(i, i + 1, i - 1, + concentrations_device, + alpha_device); + }); }); - q.memcpy(concentrations_device, concentrations_t1_device, - colMax * sizeof(T), {e_boundleft, e_boundright, e_mid}) - .wait(); + q.wait(); + + T *tmp = concentrations_device; + concentrations_device = concentrations_t1_device; + concentrations_t1_device = tmp; + + // q.memcpy(concentrations_device, concentrations_t1_device, + // colMax * sizeof(T), {e_boundleft, e_boundright, e_mid}) + // .wait(); } q.memcpy(concentrations_inout, concentrations_device, colMax * sizeof(T)) @@ -134,6 +274,17 @@ void FTCS_Sycl_impl(sycl::queue &q, tug::Grid &grid, boundaries.getBoundaryElement(BC_SIDE_LEFT, 0), boundaries.getBoundaryElement(BC_SIDE_RIGHT, 0), colMax, dtOverDeltaColSquare, nIter); + } else if (grid.getDim() == 2) { + const std::size_t rowMax = grid.getRow(); + const T deltaRow = grid.getDeltaRow(); + const T deltaRowSquare = deltaRow * deltaRow; + const T dtOverDeltaRowSquare = dt / deltaRowSquare; + + const auto serialized_boundaries = boundaries.serialize(); + + FTCS_SYCL_2D(q, grid.getConcentrations().data(), grid.getAlphaX().data(), + grid.getAlphaY().data(), serialized_boundaries.data(), colMax, + rowMax, dtOverDeltaColSquare, dtOverDeltaRowSquare, nIter); } } } // namespace tug \ No newline at end of file diff --git a/include/tug/Simulation.hpp b/include/tug/Simulation.hpp index 549d7da..1522850 100644 --- a/include/tug/Simulation.hpp +++ b/include/tug/Simulation.hpp @@ -429,6 +429,13 @@ public: } #ifdef TUG_ENABLE_SYCL else if constexpr (approach == FTCS_SYCL) { + sycl::queue &q = *this->q.get(); + + std::cout << "Running FTCS_SYCL on device: " + << q.get_device().get_info() << "\n"; + + std::cout << std::flush; + FTCS_Sycl_impl(*this->q.get(), this->grid, this->bc, this->timestep, this->iterations * innerIterations); }