diff --git a/include/tug/Core/BTCS.hpp b/include/tug/Core/BTCS.hpp index cc41abc..47e1608 100644 --- a/include/tug/Core/BTCS.hpp +++ b/include/tug/Core/BTCS.hpp @@ -358,23 +358,38 @@ static Eigen::VectorX ThomasAlgorithmAVX(Eigen::SparseMatrix &A, const __m256d zero = _mm256_setzero_pd(); const __m256d one = _mm256_set1_pd(1.0); - for (Eigen::Index i = 1; i < n; i += avx_size) { - // Load vectors into AVX registers - __m256d a_diag_avx = _mm256_loadu_pd(&a_diag[i]); - __m256d b_diag_avx = _mm256_loadu_pd(&b_diag[i]); - __m256d c_diag_avx = _mm256_loadu_pd(&c_diag[i]); - __m256d x_vec_avx = _mm256_loadu_pd(&x_vec[i]); + /* + c_diag[i] /= b_diag[i] - a_diag[i] * c_diag[i - 1]; + x_vec[i] = (x_vec[i] - a_diag[i] * x_vec[i - 1]) / (b_diag[i] - a_diag[i] * c_diag[i - 1]); + */ - // Compute intermediate results - __m256d factor = _mm256_div_pd(a_diag_avx, b_diag_avx); - c_diag_avx = _mm256_div_pd(c_diag_avx, _mm256_sub_pd(b_diag_avx, _mm256_mul_pd(a_diag_avx, c_diag_avx))); - x_vec_avx = _mm256_div_pd(_mm256_sub_pd(x_vec_avx, _mm256_mul_pd(a_diag_avx, x_vec_avx)), _mm256_sub_pd(b_diag_avx, _mm256_mul_pd(a_diag_avx, c_diag_avx))); + for (Eigen::Index i = 1; i < n; i+=4) { // Hier verwenden wir 4 Elemente gleichzeitig + __m256d a = _mm256_loadu_pd(&a_diag[i]); + __m256d b = _mm256_loadu_pd(&b_diag[i]); + __m256d c = _mm256_loadu_pd(&c_diag[i]); + __m256d x = _mm256_loadu_pd(&x_vec[i]); - // Store results back to memory - _mm256_storeu_pd(&c_diag[i], c_diag_avx); - _mm256_storeu_pd(&x_vec[i], x_vec_avx); + __m256d prev_c = _mm256_loadu_pd(&c_diag[i - 1]); + __m256d prev_x = _mm256_loadu_pd(&x_vec[i - 1]); + + __m256d tmp = _mm256_sub_pd(b, _mm256_mul_pd(a, prev_c)); + + c = _mm256_div_pd(c, tmp); + x = _mm256_div_pd(_mm256_sub_pd(x, _mm256_mul_pd(a, prev_x)), tmp); + + _mm256_storeu_pd(&c_diag[i], c); + _mm256_storeu_pd(&x_vec[i], x); } + // Remainder: Handle elements that are not multiples of 4 + for (Eigen::Index i = n - (n%4); i < n; i++) { + c_diag[i] /= b_diag[i] - a_diag[i] * c_diag[i - 1]; + x_vec[i] = (x_vec[i] - a_diag[i] * x_vec[i - 1]) / (b_diag[i] - a_diag[i] * c_diag[i - 1]); + } + //std::cout << x_vec[0] << std::endl; + x_vec[n] = (x_vec[n] - a_diag[n] * x_vec[n - 1]) / + (b_diag[n] - a_diag[n] * c_diag[n - 1]); + // Perform final back substitution for (Eigen::Index i = n; i-- > 0;) { x_vec[i] -= c_diag[i] * x_vec[i + 1]; @@ -501,11 +516,22 @@ void BTCS_LU(Grid &grid, Boundary &bc, T timestep, int numThreads) { // entry point for Thomas algorithm solver; differentiate 1D and 2D grid template -void BTCS_Thomas(Grid &grid, Boundary &bc, T timestep, int numThreads) { +void BTCS_Thomas(Grid &grid, Boundary &bc, T timestep, int numThreads, bool enableAVX) { if (grid.getDim() == 1) { - BTCS_1D(grid, bc, timestep, ThomasAlgorithm); + if(enableAVX) { + BTCS_1D(grid, bc, timestep, ThomasAlgorithmAVX); + } + else { + BTCS_1D(grid, bc, timestep, ThomasAlgorithm); + } } else if (grid.getDim() == 2) { - BTCS_2D(grid, bc, timestep, ThomasAlgorithm, numThreads); + if(enableAVX) { + BTCS_2D(grid, bc, timestep, ThomasAlgorithmAVX, numThreads); + } + else { + BTCS_2D(grid, bc, timestep, ThomasAlgorithm, numThreads); + } + } else { throw_invalid_argument( "Error: Only 1- and 2-dimensional grids are defined!"); diff --git a/include/tug/Simulation.hpp b/include/tug/Simulation.hpp index 469b411..dabff99 100644 --- a/include/tug/Simulation.hpp +++ b/include/tug/Simulation.hpp @@ -46,9 +46,9 @@ enum APPROACH { * */ enum SOLVER { - EIGEN_LU_SOLVER, /*!< EigenLU solver */ - THOMAS_ALGORITHM_SOLVER /*!< Thomas Algorithm solver; more efficient for - tridiagonal matrices */ + EIGEN_LU_SOLVER, /*!< EigenLU solver */ + THOMAS_ALGORITHM_SOLVER /*!< Thomas Algorithm solver; more efficient for + tridiagonal matrices */ }; /** @@ -82,6 +82,11 @@ enum TIME_MEASURE { TIME_MEASURE_ON /*!< print time measure after last iteration */ }; +enum AVX { + AVX_ENABLED, + AVX_DISABLED +}; + /** * @brief The class forms the interface for performing the diffusion simulations * and contains all the methods for controlling the desired parameters, such as @@ -93,7 +98,8 @@ enum TIME_MEASURE { * @tparam solver Set the solver to be used */ template + SOLVER solver = THOMAS_ALGORITHM_SOLVER, + AVX enableAVX = AVX_DISABLED> class Simulation { public: /** @@ -433,8 +439,12 @@ public: if (csv_output >= CSV_OUTPUT_VERBOSE) { printConcentrationsCSV(filename); } - - BTCS_Thomas(this->grid, this->bc, this->timestep, this->numThreads); + if constexpr (enableAVX == AVX_ENABLED) { + BTCS_Thomas(this->grid, this->bc, this->timestep, this->numThreads, true); + } + else { + BTCS_Thomas(this->grid, this->bc, this->timestep, this->numThreads, false); + } } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 19c4daf..5570148 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,3 +1,4 @@ +cmake_minimum_required(VERSION 3.29) find_library(DOCTEST_LIB doctest) if(NOT DOCTEST_LIB)