mirror of
https://git.gfz-potsdam.de/naaice/tug.git
synced 2025-12-15 10:28:23 +01:00
chore: Optimize SYCL implementation of FTCS algorithm
The commit optimizes the SYCL implementation of the FTCS algorithm by minimizing copy operations and using a row-major matrix internally.
This commit is contained in:
parent
378bb6fc72
commit
38750485ca
@ -7,60 +7,61 @@
|
||||
using namespace tug;
|
||||
using namespace Eigen;
|
||||
|
||||
MatrixXd runNormal(MatrixXd concentrations, const MatrixXd &alphas,
|
||||
Boundary<double> &boundaries, int iterations,
|
||||
MatrixXf runNormal(MatrixXf concentrations, const MatrixXf &alphas,
|
||||
Boundary<float> &boundaries, int iterations,
|
||||
double timestep) {
|
||||
Grid64 grid(concentrations.cols());
|
||||
Grid32 grid(concentrations.rows(), concentrations.cols());
|
||||
grid.setConcentrations(concentrations);
|
||||
|
||||
grid.setAlpha(alphas);
|
||||
grid.setAlpha(alphas, alphas);
|
||||
|
||||
Simulation simulation =
|
||||
Simulation<double, tug::FTCS_APPROACH>(grid, boundaries);
|
||||
Simulation<float, tug::FTCS_APPROACH>(grid, boundaries);
|
||||
|
||||
simulation.setTimestep(timestep);
|
||||
simulation.setIterations(iterations);
|
||||
simulation.run();
|
||||
|
||||
return MatrixXd(grid.getConcentrations());
|
||||
return MatrixXf(grid.getConcentrations());
|
||||
}
|
||||
|
||||
MatrixXd runWithSYCL(MatrixXd concentrations, const MatrixXd &alphas,
|
||||
Boundary<double> &boundaries, int iterations,
|
||||
MatrixXf runWithSYCL(MatrixXf concentrations, const MatrixXf &alphas,
|
||||
Boundary<float> &boundaries, int iterations,
|
||||
double timestep) {
|
||||
Grid64 grid(concentrations.cols());
|
||||
Grid32 grid(concentrations.rows(), concentrations.cols());
|
||||
grid.setConcentrations(concentrations);
|
||||
|
||||
grid.setAlpha(alphas);
|
||||
grid.setAlpha(alphas, alphas);
|
||||
|
||||
Simulation simulation = Simulation<double, tug::FTCS_SYCL>(grid, boundaries);
|
||||
Simulation simulation = Simulation<float, tug::FTCS_SYCL>(grid, boundaries);
|
||||
|
||||
simulation.setTimestep(timestep);
|
||||
simulation.setIterations(iterations);
|
||||
simulation.run();
|
||||
|
||||
return MatrixXd(grid.getConcentrations());
|
||||
return MatrixXf(grid.getConcentrations());
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
constexpr int cells = 1e7;
|
||||
constexpr double timestep = 0.1;
|
||||
constexpr int iterations = 500;
|
||||
constexpr int cells = 3e3;
|
||||
constexpr double timestep = 500;
|
||||
constexpr int iterations = 1;
|
||||
|
||||
Grid64 grid(cells);
|
||||
MatrixXf concentrations(cells, cells);
|
||||
concentrations.setConstant(20);
|
||||
concentrations(0, 0) = 100;
|
||||
|
||||
MatrixXd concentrations = MatrixXd::Constant(1, cells, 20);
|
||||
|
||||
MatrixXd alphas = MatrixXd::Constant(1, cells, 1);
|
||||
MatrixXf alphas(cells, cells);
|
||||
alphas.setConstant(1);
|
||||
|
||||
// ******************
|
||||
// **** BOUNDARY ****
|
||||
// ******************
|
||||
|
||||
// create a boundary with constant values
|
||||
Boundary bc = Boundary(grid);
|
||||
bc.setBoundarySideConstant(BC_SIDE_LEFT, 1);
|
||||
bc.setBoundarySideConstant(BC_SIDE_RIGHT, 1);
|
||||
Boundary bc = Boundary<float>(cells, cells);
|
||||
// bc.setBoundarySideConstant(BC_SIDE_LEFT, 1);
|
||||
// bc.setBoundarySideConstant(BC_SIDE_RIGHT, 1);
|
||||
|
||||
auto t_normal_begin = std::chrono::steady_clock::now();
|
||||
auto normal = runNormal(concentrations, alphas, bc, iterations, timestep);
|
||||
@ -85,9 +86,9 @@ int main(int argc, char **argv) {
|
||||
// std::cout << "Normal: " << std::endl << normal << std::endl;
|
||||
// std::cout << "SYCL: " << std::endl << sycl << std::endl;
|
||||
|
||||
auto diff = (normal - sycl).cwiseAbs().maxCoeff();
|
||||
auto diff = normal - sycl;
|
||||
|
||||
std::cout << "Max diff: " << diff << std::endl;
|
||||
std::cout << "Max diff: " << diff.cwiseAbs().maxCoeff() << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
#include "../Boundary.hpp"
|
||||
#include "../Grid.hpp"
|
||||
#include "sycl/usm.hpp"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <sycl/sycl.hpp>
|
||||
@ -11,9 +11,10 @@
|
||||
namespace tug {
|
||||
|
||||
template <typename T>
|
||||
static inline constexpr T boundApply(const T conc, const T conc_neighbor,
|
||||
const T alpha, const T alpha_neighbor,
|
||||
const T bcValue, const BC_TYPE bcType) {
|
||||
static inline constexpr T
|
||||
evaluateBoundary(const T conc, const T conc_neighbor, const T alpha,
|
||||
const T alpha_neighbor, const T bcValue,
|
||||
const BC_TYPE bcType) {
|
||||
const T alpha_intercell = calcAlphaIntercell(alpha, alpha_neighbor);
|
||||
switch (bcType) {
|
||||
case BC_TYPE::BC_TYPE_CLOSED: {
|
||||
@ -33,17 +34,145 @@ static inline constexpr T boundApply(const T conc, const T conc_neighbor,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline constexpr T explicitChange(const T conc, const T conc_n1,
|
||||
const T conc_n2, const T alpha,
|
||||
const T alpha_n1, const T alpha_n2) {
|
||||
const T alpha_intercell_n1 = calcAlphaIntercell(alpha, alpha_n1);
|
||||
const T alpha_intercell_n2 = calcAlphaIntercell(alpha, alpha_n2);
|
||||
static inline constexpr T
|
||||
calculateExplicitChange(const std::size_t idx_base, const std::size_t idx_n1,
|
||||
const std::size_t idx_n2, const T *conc,
|
||||
const T *alpha) {
|
||||
const T alpha_intercell_n1 =
|
||||
calcAlphaIntercell(alpha[idx_base], alpha[idx_n1]);
|
||||
const T alpha_intercell_n2 =
|
||||
calcAlphaIntercell(alpha[idx_base], alpha[idx_n2]);
|
||||
|
||||
const T conc_term_n1 = alpha_intercell_n1 * conc_n1;
|
||||
const T conc_term_n2 = alpha_intercell_n2 * conc_n2;
|
||||
const T conc_term_n1 = alpha_intercell_n1 * conc[idx_n1];
|
||||
const T conc_term_n2 = alpha_intercell_n2 * conc[idx_n2];
|
||||
|
||||
return conc_term_n1 + conc_term_n2 -
|
||||
(alpha_intercell_n1 + alpha_intercell_n2) * conc;
|
||||
(alpha_intercell_n1 + alpha_intercell_n2) * conc[idx_base];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline void
|
||||
FTCS_SYCL_2D(sycl::queue &q, T *concentrations_inout, const T *alphas_x_in,
|
||||
const T *alphas_y_in, const tug::BoundaryElement<T> *boundaries,
|
||||
const std::size_t colMax, const std::size_t rowMax,
|
||||
const T dtOverDeltaColSquare, const T dtOverDeltaRowSquare,
|
||||
const std::size_t nIter) {
|
||||
|
||||
T *concentrations_device = sycl::malloc_device<T>(colMax * rowMax, q);
|
||||
T *concentrations_t1_device = sycl::malloc_device<T>(colMax * rowMax, q);
|
||||
T *alpha_x_device = sycl::malloc_device<T>(colMax * rowMax, q);
|
||||
T *alpha_y_device = sycl::malloc_device<T>(colMax * rowMax, q);
|
||||
tug::BoundaryElement<T> *boundaries_device =
|
||||
sycl::malloc_device<tug::BoundaryElement<T>>(2 * colMax + 2 * rowMax, q);
|
||||
|
||||
q.memcpy(concentrations_device, concentrations_inout,
|
||||
colMax * rowMax * sizeof(T));
|
||||
q.memcpy(alpha_x_device, alphas_x_in, colMax * rowMax * sizeof(T));
|
||||
q.memcpy(alpha_y_device, alphas_y_in, colMax * rowMax * sizeof(T));
|
||||
q.memcpy(boundaries_device, boundaries,
|
||||
(2 * colMax + 2 * rowMax) * sizeof(BoundaryElement<T>));
|
||||
|
||||
const tug::BoundaryElement<T> *leftbound = &boundaries_device[0];
|
||||
const tug::BoundaryElement<T> *rightbound = &boundaries_device[rowMax];
|
||||
const tug::BoundaryElement<T> *topbound = &boundaries_device[2 * rowMax];
|
||||
const tug::BoundaryElement<T> *bottombound =
|
||||
&boundaries_device[2 * rowMax + colMax];
|
||||
|
||||
const std::size_t wgMaxSize =
|
||||
q.get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
constexpr std::size_t CUDA_WARP_SIZE = 32;
|
||||
const std::size_t col_wg = std::min(colMax, CUDA_WARP_SIZE);
|
||||
const std::size_t row_wg = wgMaxSize / col_wg;
|
||||
|
||||
auto round_to_multiple = [](std::size_t value, std::size_t multiple) {
|
||||
return ((value + multiple - 1) / multiple) * multiple;
|
||||
};
|
||||
|
||||
sycl::range<2> wg_range(col_wg, row_wg);
|
||||
sycl::range<2> global_range = sycl::range<2>(
|
||||
round_to_multiple(colMax, col_wg), round_to_multiple(rowMax, row_wg));
|
||||
|
||||
q.wait();
|
||||
|
||||
for (std::size_t i = 0; i < nIter; i++) {
|
||||
q.submit([&](sycl::handler &h) {
|
||||
h.parallel_for(
|
||||
sycl::nd_range<2>(global_range, wg_range),
|
||||
[=](sycl::nd_item<2> item2DIndex) {
|
||||
const std::size_t col_i = item2DIndex.get_global_id(0);
|
||||
const std::size_t row_j = item2DIndex.get_global_id(1);
|
||||
|
||||
if (col_i >= colMax || row_j >= rowMax) {
|
||||
return;
|
||||
}
|
||||
|
||||
const std::size_t idx_self = col_i + row_j * colMax;
|
||||
const std::size_t idx_left = idx_self - 1;
|
||||
const std::size_t idx_right = idx_self + 1;
|
||||
const std::size_t idx_top = idx_self - colMax;
|
||||
const std::size_t idx_bottom = idx_self + colMax;
|
||||
|
||||
T vertical_change;
|
||||
|
||||
if (row_j == 0 || row_j == rowMax - 1) {
|
||||
const std::size_t neighbor_idx =
|
||||
row_j == 0 ? idx_bottom : idx_top;
|
||||
const tug::BoundaryElement<T> &bound =
|
||||
row_j == 0 ? topbound[col_i] : bottombound[col_i];
|
||||
|
||||
vertical_change = evaluateBoundary(
|
||||
concentrations_device[idx_self],
|
||||
concentrations_device[neighbor_idx], alpha_y_device[idx_self],
|
||||
alpha_y_device[neighbor_idx], bound.getValue(),
|
||||
bound.getType());
|
||||
} else {
|
||||
vertical_change = calculateExplicitChange(
|
||||
idx_self, idx_bottom, idx_top, concentrations_device,
|
||||
alpha_y_device);
|
||||
}
|
||||
|
||||
T horizontal_change;
|
||||
|
||||
if (col_i == 0 || col_i == colMax - 1) {
|
||||
const std::size_t neighbor_idx =
|
||||
col_i == 0 ? idx_right : idx_left;
|
||||
const tug::BoundaryElement<T> &bound =
|
||||
col_i == 0 ? leftbound[row_j] : rightbound[row_j];
|
||||
|
||||
horizontal_change = evaluateBoundary(
|
||||
concentrations_device[idx_self],
|
||||
concentrations_device[neighbor_idx], alpha_x_device[idx_self],
|
||||
alpha_x_device[neighbor_idx], bound.getValue(),
|
||||
bound.getType());
|
||||
} else {
|
||||
horizontal_change = calculateExplicitChange(
|
||||
idx_self, idx_right, idx_left, concentrations_device,
|
||||
alpha_x_device);
|
||||
}
|
||||
concentrations_t1_device[idx_self] =
|
||||
concentrations_device[idx_self] +
|
||||
dtOverDeltaColSquare * horizontal_change +
|
||||
dtOverDeltaRowSquare * vertical_change;
|
||||
});
|
||||
});
|
||||
|
||||
q.wait();
|
||||
|
||||
T *tmp = concentrations_device;
|
||||
concentrations_device = concentrations_t1_device;
|
||||
concentrations_t1_device = tmp;
|
||||
}
|
||||
|
||||
q.memcpy(concentrations_inout, concentrations_device,
|
||||
colMax * rowMax * sizeof(T))
|
||||
.wait();
|
||||
|
||||
sycl::free(concentrations_device, q);
|
||||
sycl::free(concentrations_t1_device, q);
|
||||
sycl::free(alpha_x_device, q);
|
||||
sycl::free(alpha_y_device, q);
|
||||
sycl::free(boundaries_device, q);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -60,6 +189,9 @@ FTCS_SYCL_1D(sycl::queue &q, T *concentrations_inout, const T *alphas_in,
|
||||
const T bcValue_left = leftbound.getValue();
|
||||
const T bcValue_right = rightbound.getValue();
|
||||
|
||||
const std::size_t max_wg_size =
|
||||
q.get_device().get_info<sycl::info::device::max_work_group_size>();
|
||||
|
||||
{
|
||||
T *concentrations_device = sycl::malloc_device<T>(colMax, q);
|
||||
T *concentrations_t1_device = sycl::malloc_device<T>(colMax, q);
|
||||
@ -76,9 +208,9 @@ FTCS_SYCL_1D(sycl::queue &q, T *concentrations_inout, const T *alphas_in,
|
||||
concentrations_t1_device[0] =
|
||||
concentrations_device[0] +
|
||||
dtOverDeltaColSquare *
|
||||
boundApply(concentrations_device[0], concentrations_device[1],
|
||||
alpha_device[0], alpha_device[1], bcValue_left,
|
||||
bcType_left);
|
||||
evaluateBoundary(concentrations_device[0],
|
||||
concentrations_device[1], alpha_device[0],
|
||||
alpha_device[1], bcValue_left, bcType_left);
|
||||
});
|
||||
});
|
||||
|
||||
@ -87,29 +219,37 @@ FTCS_SYCL_1D(sycl::queue &q, T *concentrations_inout, const T *alphas_in,
|
||||
concentrations_t1_device[colMax - 1] =
|
||||
concentrations_device[colMax - 1] +
|
||||
dtOverDeltaColSquare *
|
||||
boundApply(concentrations_device[colMax - 1],
|
||||
concentrations_device[colMax - 2],
|
||||
alpha_device[colMax - 1], alpha_device[colMax - 2],
|
||||
bcValue_right, bcType_right);
|
||||
evaluateBoundary(concentrations_device[colMax - 1],
|
||||
concentrations_device[colMax - 2],
|
||||
alpha_device[colMax - 1],
|
||||
alpha_device[colMax - 2], bcValue_right,
|
||||
bcType_right);
|
||||
});
|
||||
});
|
||||
|
||||
auto e_mid = q.submit([&](sycl::handler &h) {
|
||||
h.parallel_for(sycl::range<1>(colMax - 2), [=](sycl::id<1> idx) {
|
||||
const std::size_t i = idx[0] + 1;
|
||||
concentrations_t1_device[i] =
|
||||
concentrations_device[i] +
|
||||
dtOverDeltaColSquare *
|
||||
explicitChange(concentrations_device[i],
|
||||
concentrations_device[i + 1],
|
||||
concentrations_device[i - 1], alpha_device[i],
|
||||
alpha_device[i + 1], alpha_device[i - 1]);
|
||||
});
|
||||
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(colMax - 2),
|
||||
sycl::range<1>(max_wg_size)),
|
||||
[=](sycl::nd_item<1> item) {
|
||||
const std::size_t i = item.get_global_id(0) + 1;
|
||||
concentrations_t1_device[i] =
|
||||
concentrations_device[i] +
|
||||
dtOverDeltaColSquare *
|
||||
calculateExplicitChange(i, i + 1, i - 1,
|
||||
concentrations_device,
|
||||
alpha_device);
|
||||
});
|
||||
});
|
||||
|
||||
q.memcpy(concentrations_device, concentrations_t1_device,
|
||||
colMax * sizeof(T), {e_boundleft, e_boundright, e_mid})
|
||||
.wait();
|
||||
q.wait();
|
||||
|
||||
T *tmp = concentrations_device;
|
||||
concentrations_device = concentrations_t1_device;
|
||||
concentrations_t1_device = tmp;
|
||||
|
||||
// q.memcpy(concentrations_device, concentrations_t1_device,
|
||||
// colMax * sizeof(T), {e_boundleft, e_boundright, e_mid})
|
||||
// .wait();
|
||||
}
|
||||
|
||||
q.memcpy(concentrations_inout, concentrations_device, colMax * sizeof(T))
|
||||
@ -134,6 +274,17 @@ void FTCS_Sycl_impl(sycl::queue &q, tug::Grid<T> &grid,
|
||||
boundaries.getBoundaryElement(BC_SIDE_LEFT, 0),
|
||||
boundaries.getBoundaryElement(BC_SIDE_RIGHT, 0), colMax,
|
||||
dtOverDeltaColSquare, nIter);
|
||||
} else if (grid.getDim() == 2) {
|
||||
const std::size_t rowMax = grid.getRow();
|
||||
const T deltaRow = grid.getDeltaRow();
|
||||
const T deltaRowSquare = deltaRow * deltaRow;
|
||||
const T dtOverDeltaRowSquare = dt / deltaRowSquare;
|
||||
|
||||
const auto serialized_boundaries = boundaries.serialize();
|
||||
|
||||
FTCS_SYCL_2D(q, grid.getConcentrations().data(), grid.getAlphaX().data(),
|
||||
grid.getAlphaY().data(), serialized_boundaries.data(), colMax,
|
||||
rowMax, dtOverDeltaColSquare, dtOverDeltaRowSquare, nIter);
|
||||
}
|
||||
}
|
||||
} // namespace tug
|
||||
@ -429,6 +429,13 @@ public:
|
||||
}
|
||||
#ifdef TUG_ENABLE_SYCL
|
||||
else if constexpr (approach == FTCS_SYCL) {
|
||||
sycl::queue &q = *this->q.get();
|
||||
|
||||
std::cout << "Running FTCS_SYCL on device: "
|
||||
<< q.get_device().get_info<sycl::info::device::name>() << "\n";
|
||||
|
||||
std::cout << std::flush;
|
||||
|
||||
FTCS_Sycl_impl(*this->q.get(), this->grid, this->bc, this->timestep,
|
||||
this->iterations * innerIterations);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user