mirror of
https://git.gfz-potsdam.de/naaice/tug.git
synced 2025-12-15 18:38:23 +01:00
fixed avx algorithm and added constexpr to switch avx
This commit is contained in:
parent
001f59fae8
commit
247999f69a
@ -358,23 +358,38 @@ static Eigen::VectorX<T> ThomasAlgorithmAVX(Eigen::SparseMatrix<T> &A,
|
||||
const __m256d zero = _mm256_setzero_pd();
|
||||
const __m256d one = _mm256_set1_pd(1.0);
|
||||
|
||||
for (Eigen::Index i = 1; i < n; i += avx_size) {
|
||||
// Load vectors into AVX registers
|
||||
__m256d a_diag_avx = _mm256_loadu_pd(&a_diag[i]);
|
||||
__m256d b_diag_avx = _mm256_loadu_pd(&b_diag[i]);
|
||||
__m256d c_diag_avx = _mm256_loadu_pd(&c_diag[i]);
|
||||
__m256d x_vec_avx = _mm256_loadu_pd(&x_vec[i]);
|
||||
/*
|
||||
c_diag[i] /= b_diag[i] - a_diag[i] * c_diag[i - 1];
|
||||
x_vec[i] = (x_vec[i] - a_diag[i] * x_vec[i - 1]) / (b_diag[i] - a_diag[i] * c_diag[i - 1]);
|
||||
*/
|
||||
|
||||
// Compute intermediate results
|
||||
__m256d factor = _mm256_div_pd(a_diag_avx, b_diag_avx);
|
||||
c_diag_avx = _mm256_div_pd(c_diag_avx, _mm256_sub_pd(b_diag_avx, _mm256_mul_pd(a_diag_avx, c_diag_avx)));
|
||||
x_vec_avx = _mm256_div_pd(_mm256_sub_pd(x_vec_avx, _mm256_mul_pd(a_diag_avx, x_vec_avx)), _mm256_sub_pd(b_diag_avx, _mm256_mul_pd(a_diag_avx, c_diag_avx)));
|
||||
for (Eigen::Index i = 1; i < n; i+=4) { // Hier verwenden wir 4 Elemente gleichzeitig
|
||||
__m256d a = _mm256_loadu_pd(&a_diag[i]);
|
||||
__m256d b = _mm256_loadu_pd(&b_diag[i]);
|
||||
__m256d c = _mm256_loadu_pd(&c_diag[i]);
|
||||
__m256d x = _mm256_loadu_pd(&x_vec[i]);
|
||||
|
||||
// Store results back to memory
|
||||
_mm256_storeu_pd(&c_diag[i], c_diag_avx);
|
||||
_mm256_storeu_pd(&x_vec[i], x_vec_avx);
|
||||
__m256d prev_c = _mm256_loadu_pd(&c_diag[i - 1]);
|
||||
__m256d prev_x = _mm256_loadu_pd(&x_vec[i - 1]);
|
||||
|
||||
__m256d tmp = _mm256_sub_pd(b, _mm256_mul_pd(a, prev_c));
|
||||
|
||||
c = _mm256_div_pd(c, tmp);
|
||||
x = _mm256_div_pd(_mm256_sub_pd(x, _mm256_mul_pd(a, prev_x)), tmp);
|
||||
|
||||
_mm256_storeu_pd(&c_diag[i], c);
|
||||
_mm256_storeu_pd(&x_vec[i], x);
|
||||
}
|
||||
|
||||
// Remainder: Handle elements that are not multiples of 4
|
||||
for (Eigen::Index i = n - (n%4); i < n; i++) {
|
||||
c_diag[i] /= b_diag[i] - a_diag[i] * c_diag[i - 1];
|
||||
x_vec[i] = (x_vec[i] - a_diag[i] * x_vec[i - 1]) / (b_diag[i] - a_diag[i] * c_diag[i - 1]);
|
||||
}
|
||||
//std::cout << x_vec[0] << std::endl;
|
||||
x_vec[n] = (x_vec[n] - a_diag[n] * x_vec[n - 1]) /
|
||||
(b_diag[n] - a_diag[n] * c_diag[n - 1]);
|
||||
|
||||
// Perform final back substitution
|
||||
for (Eigen::Index i = n; i-- > 0;) {
|
||||
x_vec[i] -= c_diag[i] * x_vec[i + 1];
|
||||
@ -501,11 +516,22 @@ void BTCS_LU(Grid<T> &grid, Boundary<T> &bc, T timestep, int numThreads) {
|
||||
|
||||
// entry point for Thomas algorithm solver; differentiate 1D and 2D grid
|
||||
template <class T>
|
||||
void BTCS_Thomas(Grid<T> &grid, Boundary<T> &bc, T timestep, int numThreads) {
|
||||
void BTCS_Thomas(Grid<T> &grid, Boundary<T> &bc, T timestep, int numThreads, bool enableAVX) {
|
||||
if (grid.getDim() == 1) {
|
||||
BTCS_1D(grid, bc, timestep, ThomasAlgorithm);
|
||||
if(enableAVX) {
|
||||
BTCS_1D(grid, bc, timestep, ThomasAlgorithmAVX);
|
||||
}
|
||||
else {
|
||||
BTCS_1D(grid, bc, timestep, ThomasAlgorithm);
|
||||
}
|
||||
} else if (grid.getDim() == 2) {
|
||||
BTCS_2D(grid, bc, timestep, ThomasAlgorithm, numThreads);
|
||||
if(enableAVX) {
|
||||
BTCS_2D(grid, bc, timestep, ThomasAlgorithmAVX, numThreads);
|
||||
}
|
||||
else {
|
||||
BTCS_2D(grid, bc, timestep, ThomasAlgorithm, numThreads);
|
||||
}
|
||||
|
||||
} else {
|
||||
throw_invalid_argument(
|
||||
"Error: Only 1- and 2-dimensional grids are defined!");
|
||||
|
||||
@ -46,9 +46,9 @@ enum APPROACH {
|
||||
*
|
||||
*/
|
||||
enum SOLVER {
|
||||
EIGEN_LU_SOLVER, /*!< EigenLU solver */
|
||||
THOMAS_ALGORITHM_SOLVER /*!< Thomas Algorithm solver; more efficient for
|
||||
tridiagonal matrices */
|
||||
EIGEN_LU_SOLVER, /*!< EigenLU solver */
|
||||
THOMAS_ALGORITHM_SOLVER /*!< Thomas Algorithm solver; more efficient for
|
||||
tridiagonal matrices */
|
||||
};
|
||||
|
||||
/**
|
||||
@ -82,6 +82,11 @@ enum TIME_MEASURE {
|
||||
TIME_MEASURE_ON /*!< print time measure after last iteration */
|
||||
};
|
||||
|
||||
enum AVX {
|
||||
AVX_ENABLED,
|
||||
AVX_DISABLED
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The class forms the interface for performing the diffusion simulations
|
||||
* and contains all the methods for controlling the desired parameters, such as
|
||||
@ -93,7 +98,8 @@ enum TIME_MEASURE {
|
||||
* @tparam solver Set the solver to be used
|
||||
*/
|
||||
template <class T, APPROACH approach = BTCS_APPROACH,
|
||||
SOLVER solver = THOMAS_ALGORITHM_SOLVER>
|
||||
SOLVER solver = THOMAS_ALGORITHM_SOLVER,
|
||||
AVX enableAVX = AVX_DISABLED>
|
||||
class Simulation {
|
||||
public:
|
||||
/**
|
||||
@ -433,8 +439,12 @@ public:
|
||||
if (csv_output >= CSV_OUTPUT_VERBOSE) {
|
||||
printConcentrationsCSV(filename);
|
||||
}
|
||||
|
||||
BTCS_Thomas(this->grid, this->bc, this->timestep, this->numThreads);
|
||||
if constexpr (enableAVX == AVX_ENABLED) {
|
||||
BTCS_Thomas(this->grid, this->bc, this->timestep, this->numThreads, true);
|
||||
}
|
||||
else {
|
||||
BTCS_Thomas(this->grid, this->bc, this->timestep, this->numThreads, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.29)
|
||||
find_library(DOCTEST_LIB doctest)
|
||||
|
||||
if(NOT DOCTEST_LIB)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user