From 36d6374714eb3cf60a3e57e519472b692322a7e1 Mon Sep 17 00:00:00 2001 From: Miroslav Stoyanov <30537612+mkstoyanov@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:05:59 -0500 Subject: [PATCH] Coefficient improvements (#663) * improved the coefficient generation --- src/asgard_matrix.hpp | 1 + src/coefficients.cpp | 625 ++++++++++++++++++++---------------------- src/fast_math.hpp | 26 +- 3 files changed, 308 insertions(+), 344 deletions(-) diff --git a/src/asgard_matrix.hpp b/src/asgard_matrix.hpp index 1dec24418..1b406ae04 100644 --- a/src/asgard_matrix.hpp +++ b/src/asgard_matrix.hpp @@ -273,6 +273,7 @@ class matrix template> void dump_to_octave(std::filesystem::path const &filename) const; + using value_type = P; using iterator = matrix_iterator

; using const_iterator = matrix_iterator

; diff --git a/src/coefficients.cpp b/src/coefficients.cpp index 0d4320427..e44dd9aaf 100644 --- a/src/coefficients.cpp +++ b/src/coefficients.cpp @@ -160,7 +160,10 @@ void generate_dimension_mass_mat( // this routine returns a 2D array representing an operator coefficient // matrix for a single dimension (1D). Each term in a PDE requires D many // coefficient matricies -template +// +// the coeff_type must match pterm.coeff_type, it is a template parameter +// so that we can simplify the code and avoid runtime cost with if-constexpr +template fk::matrix

generate_coefficients( dimension

const &dim, partial_term

const &pterm, basis::wavelet_transform const &transformer, @@ -170,6 +173,7 @@ fk::matrix

generate_coefficients( expect(transformer.degree == dim.get_degree()); expect(transformer.max_level >= dim.get_level()); expect(level <= transformer.max_level); + expect(coeff_type == pterm.coeff_type); auto g_dv_func = [g_func = pterm.g_func, dv_func = pterm.dv_func]() -> g_func_type

{ @@ -208,8 +212,10 @@ fk::matrix

generate_coefficients( // we do the two-step store because we cannot have 'static' bindings static auto const legendre_values = legendre_weights

(dim.get_degree(), -1.0, 1.0); - auto const [quadrature_points, quadrature_weights] = legendre_values; - auto const [legendre_poly_L, legendre_poly_R] = [&]() { + auto const &quadrature_points = legendre_values[0]; + auto const &quadrature_weights = legendre_values[1]; + + auto const legendre_poly_LR = [&]() { auto [lP_L, lPP_L] = legendre(fk::vector

{-1}, dim.get_degree()); lP_L = lP_L * (1 / std::sqrt(grid_spacing)); auto [lP_R, lPP_R] = legendre(fk::vector

{+1}, dim.get_degree()); @@ -219,14 +225,12 @@ fk::matrix

generate_coefficients( ignore(lPP_R); return std::array, 2>{lP_L, lP_R}; }(); - - auto const legendre_poly_L_t = fk::matrix

(legendre_poly_L).transpose(); - auto const legendre_poly_R_t = fk::matrix

(legendre_poly_R).transpose(); + auto const &legendre_poly_L = legendre_poly_LR[0]; + auto const &legendre_poly_R = legendre_poly_LR[1]; // get the basis functions and derivatives for all k // this auto is std::array, 2> - auto const [legendre_poly, - legendre_prime] = [&, quadrature_points = quadrature_points]() { + auto const legendre_poly_prime = [&]() { auto [lP, lPP] = legendre(quadrature_points, dim.get_degree()); lP = lP * (1.0 / std::sqrt(grid_spacing)); @@ -235,370 +239,304 @@ fk::matrix

generate_coefficients( return std::array, 2>{lP, lPP}; }(); - auto const legendre_poly_t = fk::matrix

(legendre_poly).transpose(); - auto const legendre_prime_t = fk::matrix

(legendre_prime).transpose(); + int const porder = dim.get_degree() - 1; + + // adds a matrix mat (scaled by alpha) into a block of coefficients + auto coeff_axpy = [&](int begin, int end, P alpha, fk::matrix

const &mat) + -> void { + fk::matrix blk(coefficients, begin, begin + porder, + end, end + porder); + + for (int j = 0; j <= porder; j++) + for (int i = 0; i <= porder; i++) + blk(i, j) += alpha * mat(i, j); + }; + + auto const &legendre_poly = legendre_poly_prime[0]; + auto const &legendre_prime = legendre_poly_prime[1]; // get jacobian auto const jacobi = grid_spacing / 2; - for (auto i = 0; i < num_cells; ++i) + fk::matrix

matrix_LtR(legendre_poly_L.ncols(), legendre_poly_R.ncols()); + fm::gemm(legendre_poly_L, legendre_poly_R, matrix_LtR, true, false, P{1}, P{0}); + + fk::matrix

matrix_LtL(legendre_poly_L.ncols(), legendre_poly_L.ncols()); + fm::gemm(legendre_poly_L, legendre_poly_L, matrix_LtL, true, false, P{1}, P{0}); + + fk::matrix

matrix_RtR(legendre_poly_R.ncols(), legendre_poly_R.ncols()); + fm::gemm(legendre_poly_R, legendre_poly_R, matrix_RtR, true, false, P{1}, P{0}); + + fk::matrix

matrix_RtL(legendre_poly_R.ncols(), legendre_poly_L.ncols()); + fm::gemm(legendre_poly_R, legendre_poly_L, matrix_RtL, true, false, P{1}, P{0}); + + // General algorithm + // For each cell, we need to compute the volume integral within the cell + // and then the fluxes given the neighboring cells to the left and right. + // Computing three blocks per cell with size (porder + 1) by (porder + 1) + // The blocks are denoted by: + // (current, current - porder - 1) + // (current, current), + // (current, current + porder + 1) + // where "current" for cell "i" is "(porder + 1) * i" + // + // The key here is to add the properly scaled matrix LtR, LtL etc. + // to the correct block of the coefficients matrix. + // + // If using periodic boundary, the left-most and right-most cells wrap around + // i.e., the left cell for the left-most cell is the right-most cell. + // Even without periodicity, left/right-most cells need special consideration + // as they must use the Dirichlet or Neumann boundary condition in the flux. + // Thus, the main for-loop works only on the interior cells + // and the left most cell (0) and right most cell (num_cells - 1) + // are handled separately. + +#pragma omp parallel { - // get left and right locations for this element - auto const x_left = dim.domain_min + i * grid_spacing; - auto const x_right = x_left + grid_spacing; - - // get index for current, first and last element - auto const current = dim.get_degree() * i; - auto const first = 0; - auto const last = dim.get_degree() * (num_cells - 1); - - // map quadrature points from [-1,1] to physical domain of this i element - fk::vector

const quadrature_points_i = [&, quadrature_points = - quadrature_points]() { - fk::vector

quadrature_points_copy(quadrature_points); - std::transform( - quadrature_points_copy.begin(), quadrature_points_copy.end(), - quadrature_points_copy.begin(), [&](P const elem) { - return ((elem + 1) / 2 + i) * grid_spacing + dim.domain_min; - }); - return quadrature_points_copy; - }(); - - fk::vector

const g_vector = [&, legendre_poly = legendre_poly]() { - fk::vector

g(quadrature_points_i.size()); - for (auto j = 0; j < quadrature_points_i.size(); ++j) + // each thread will allocate it's own tmp matrix + fk::matrix

tmp(legendre_poly.nrows(), legendre_poly.ncols()); + + // tmp will be captured inside the lambda closure + // no allocations will occur per call + auto apply_volume = [&](int i) -> void { + // the penalty term does not include a volume integral + if constexpr (coeff_type != coefficient_type::penalty) { - g(j) = g_dv_func(quadrature_points_i(j), time); - } - return g; - }(); + int const current = dim.get_degree() * i; - auto const block = [&, legendre_poly = legendre_poly, - quadrature_weights = quadrature_weights]() { - fk::matrix

tmp(legendre_poly.nrows(), legendre_poly.ncols()); - - for (int j = 0; j < tmp.nrows(); j++) - { - for (int k = 0; k < tmp.ncols(); k++) + for (int k = 0; k < tmp.nrows(); k++) { - tmp(j, k) = g_vector(j) * legendre_poly(j, k) * - quadrature_weights(j) * jacobi; + P c = g_dv_func( + (0.5 * quadrature_points[k] + 0.5 + i) * grid_spacing + dim.domain_min, time); + c *= quadrature_weights(k) * jacobi; + + for (int j = 0; j < tmp.ncols(); j++) + tmp(k, j) = c * legendre_poly(k, j); } - } - fk::matrix

output(dim.get_degree(), dim.get_degree()); - if (pterm.coeff_type == coefficient_type::mass) - { - output = legendre_poly_t * tmp; - } - else if (pterm.coeff_type == coefficient_type::grad || - pterm.coeff_type == coefficient_type::div) - { - output = legendre_prime_t * tmp * (-1); + fk::matrix blk(coefficients, current, + current + porder, current, + current + porder); + if constexpr (coeff_type == coefficient_type::mass) + // volume integral where phi is trial(tmp) and psi is test(legendre_poly) + // \int phi(x)\psi(x) dx + fm::gemm(legendre_poly, tmp, blk, true, false, P{1}, P{1}); + else // div or grad falls here + // -\int \phi(x)\psi'(x) dx + fm::gemm(legendre_prime, tmp, blk, true, false, P{-1}, P{1}); } - // If pterm.coeff_type == coefficient_type::penalty is true, there's - // no volume term so the output is zeros. - return output; - }(); - - // set the block at the correct position - fk::matrix

const curr_block = - fk::matrix(coefficients, current, - current + dim.get_degree() - 1, current, - current + dim.get_degree() - 1) + - block; - coefficients.set_submatrix(current, current, curr_block); - - if (pterm.coeff_type == coefficient_type::grad || - pterm.coeff_type == coefficient_type::div || - pterm.coeff_type == coefficient_type::penalty) + }; + +#pragma omp for + for (int i = 1; i < num_cells - 1; ++i) { - // setup numerical flux choice/boundary conditions - // - // - - //---------------------------------------------- - // Numerical Flux is defined as - // Flux = {{f}} + C/2*[[u]] - // = ( f_L + f_R )/2 + FunCoef*( u_R - u_L )/2 - // [[v]] = v_R - v_L - - // FIXME G functions should accept G(x,p,t,dat), since we don't know how - // the dat is going to be used in the G function (above it is used as - // linear multuplication but this is not always true) - - // Penalty term is just <|gfunc|/2[[f]],[[v]]> so we need to remove the - // central flux from the operators - P const central_coeff = - pterm.coeff_type == coefficient_type::penalty ? 0.0 : 1.0; - - P const flux_left = g_dv_func(x_left, time); - P const flux_right = g_dv_func(x_right, time); - - // get the "trace" values - // (values at the left and right of each element for all k) - // ------------------------------------------------------------------------- - // More detailed explanation - // Each trace_value_ evaluates - // where v is a DG functions with support on I_i. The - // difference between the trace_values_ varies with the edge the flux - // is evaluated on and the support of the DG function f. - // The legendre_poly_X is the trace of f and legende_poly_X_t is for v - // We will use f=p_X for the polynomials where X=L (left boundary of cell) - // or X=R (right boundary of cell). Similar for v but depends on the - // support - - // trace_value_1 is the interaction on x_{i-1/2} -- - // the edge between cell I_{i-1} and I_i or the left boundary of I_i. - // f is a DG function with support on I_{i-1} - // In this case: {{f}} = p_R/2, [[f]] = p_R, [[v]] = -p_L - auto trace_value_1 = - (legendre_poly_L_t * legendre_poly_R) * central_coeff * - (-1 * flux_left / 2) + - (legendre_poly_L_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_left) / 2 * -1); - - // trace_value_2 is the interaction on x_{i-1/2} -- - // the edge between cell I_{i-1} and I_i or the left boundary of I_i. - // f is a DG function with support on I_{i} - // In this case: {{f}} = p_L/2, [[f]] = -p_L, [[v]] = -p_L - auto trace_value_2 = - (legendre_poly_L_t * legendre_poly_L) * central_coeff * - (-1 * flux_left / 2) + - (legendre_poly_L_t * legendre_poly_L) * - (-1 * pterm.get_flux_scale() * std::abs(flux_left) / 2 * -1); - - // trace_value_3 is the interaction on x_{i+1/2} -- - // the edge between cell I_i and I_{i+1} or the right boundary of I_i. - // f is a DG function with support on I_{i} - // In this case: {{f}} = p_R/2, [[f]] = p_R, [[v]] = p_R - auto trace_value_3 = - (legendre_poly_R_t * legendre_poly_R) * central_coeff * - (+1 * flux_right / 2) + - (legendre_poly_R_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); - - // trace_value_4 is the interaction on x_{i+1/2} -- - // the edge between cell I_i and I_{i+1} or the right boundary of I_i. - // f is a DG function with support on I_{i+1} - // In this case: {{f}} = p_L/2, [[f]] = -p_L, [[v]] = p_R - auto trace_value_4 = - (legendre_poly_R_t * legendre_poly_L) * central_coeff * - (+1 * flux_right / 2) + - (legendre_poly_R_t * legendre_poly_L) * - (-1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); - - // If dirichelt - // u^-_LEFT = g(LEFT) - // u^+_RIGHT = g(RIGHT) - boundary_condition const left = pterm.ileft; - boundary_condition const right = pterm.iright; - - // Dirichlet Boundary Conditions - // For div and grad, the boundary is not part of the bilinear operator, - // but instead tranferred to the source. Similar to an inflow condition. - // For penalty, the operator <|gfunc|/2*f,v> is applied for the case where - // f and v share the same volume support - - // If statement checking coeff_type is because gfunc can evaluate to nan - // in 1/0 case. Ex: gfunc = x, domain = [0,4] (possible in spherical - // coordinates) - - if (left == boundary_condition::dirichlet) // left dirichlet - { - if (i == 0) - { - trace_value_1 = - (legendre_poly_L_t * (legendre_poly_L - legendre_poly_L)) * (-1); - if (pterm.coeff_type == coefficient_type::penalty) - { - trace_value_2 = (legendre_poly_L_t * legendre_poly_L) * - (-1.0 * pterm.get_flux_scale() * - std::abs(flux_left) / 2.0 * -1.0); - } - else - { - trace_value_2 = - (legendre_poly_L_t * (legendre_poly_L - legendre_poly_L)) * - (-1); - } - trace_value_3 = - (legendre_poly_R_t * legendre_poly_R) * (+1 * flux_right / 2) * - central_coeff + - (legendre_poly_R_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); - trace_value_4 = - (legendre_poly_R_t * legendre_poly_L) * (+1 * flux_right / 2) * - central_coeff + - (legendre_poly_R_t * legendre_poly_L) * - (-1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); - } - } + // looping over the interior cells - if (right == boundary_condition::dirichlet) // right dirichlet + // get left and right locations for this element + P const x_left = dim.domain_min + i * grid_spacing; + P const x_right = x_left + grid_spacing; + + // get index for current block + int const current = dim.get_degree() * i; + + apply_volume(i); + + if constexpr (coeff_type == coefficient_type::grad or + coeff_type == coefficient_type::div or + coeff_type == coefficient_type::penalty) { - if (i == num_cells - 1) + // setup numerical flux choice/boundary conditions + // + // - + //---------------------------------------------- + // Numerical Flux is defined as + // Flux = {{f}} + C/2*[[u]] + // = ( f_L + f_R )/2 + FunCoef*( u_R - u_L )/2 + // [[v]] = v_R - v_L + + // FIXME G functions should accept G(x,p,t,dat), since we don't know how + // the dat is going to be used in the G function (above it is used as + // linear multuplication but this is not always true) + + P fluxL2 = 0.5 * g_dv_func(x_left, time); + P fluxR2 = 0.5 * g_dv_func(x_right, time); + + P const fluxL2abs = pterm.get_flux_scale() * std::abs(fluxL2); + P const fluxR2abs = pterm.get_flux_scale() * std::abs(fluxR2); + + if constexpr (coeff_type == coefficient_type::penalty) { - trace_value_1 = - (legendre_poly_L_t * legendre_poly_R) * (-1 * flux_left / 2) * - central_coeff + - (legendre_poly_L_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_left) / 2 * -1); - trace_value_2 = - (legendre_poly_L_t * legendre_poly_L) * (-1 * flux_left / 2) * - central_coeff + - (legendre_poly_L_t * legendre_poly_L) * - (-1 * pterm.get_flux_scale() * std::abs(flux_left) / 2 * -1); - if (pterm.coeff_type == coefficient_type::penalty) - { - trace_value_3 = - (legendre_poly_R_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); - } - else - { - trace_value_3 = - (legendre_poly_R_t * (legendre_poly_R - legendre_poly_R)) * - (+1); - } - trace_value_4 = - (legendre_poly_R_t * (legendre_poly_R - legendre_poly_R)) * (+1); + fluxL2 = 0; + fluxR2 = 0; } + + // get the "trace" values + // (values at the left and right of each element for all k) + // ------------------------------------------------------------------------- + // More detailed explanation + // Each trace_value_ evaluates + // where v is a DG functions with support on I_i. The + // difference between the trace_values_ varies with the edge the flux + // is evaluated on and the support of the DG function f. + // The legendre_poly_X is the trace of f and legende_poly_X_t is for v + // We will use f=p_X for the polynomials where X=L (left boundary of cell) + // or X=R (right boundary of cell). Similar for v but depends on the + // support. Note matrix multiply ordering goes by + // v_mat^T * f_mat for + + // trace_value_1 is the interaction on x_{i-1/2} -- + // the edge between cell I_{i-1} and I_i or the left boundary of I_i. + // f is a DG function with support on I_{i-1} + // In this case: {{f}} = p_R/2, [[f]] = p_R, [[v]] = -p_L + // (in the code below and in all other cases, the expressions has been + // simplified by applying the negative or positive -p_L) + coeff_axpy(current, current - porder - 1, -fluxL2 - fluxL2abs, matrix_LtR); + + // trace_value_2 is the interaction on x_{i-1/2} -- + // the edge between cell I_{i-1} and I_i or the left boundary of I_i. + // f is a DG function with support on I_{i} + // In this case: {{f}} = p_L/2, [[f]] = -p_L, [[v]] = -p_L + coeff_axpy(current, current, -fluxL2 + fluxL2abs, matrix_LtL); + + // trace_value_3 is the interaction on x_{i+1/2} -- + // the edge between cell I_i and I_{i+1} or the right boundary of I_i. + // f is a DG function with support on I_{i} + // In this case: {{f}} = p_R/2, [[f]] = p_R, [[v]] = p_R + coeff_axpy(current, current, fluxR2 + fluxR2abs, matrix_RtR); + + // trace_value_4 is the interaction on x_{i+1/2} -- + // the edge between cell I_i and I_{i+1} or the right boundary of I_i. + // f is a DG function with support on I_{i+1} + // In this case: {{f}} = p_L/2, [[f]] = -p_L, [[v]] = p_R + coeff_axpy(current, current + porder + 1, fluxR2 - fluxR2abs, matrix_RtL); + + // If dirichelt + // u^-_LEFT = g(LEFT) + // u^+_RIGHT = g(RIGHT) + + // Dirichlet Boundary Conditions + // For div and grad, the boundary is not part of the bilinear operator, + // but instead tranferred to the source. Similar to an inflow condition. + // For penalty, the operator <|gfunc|/2*f,v> is applied for the case where + // f and v share the same volume support + + // If statement checking coeff_type is because gfunc can evaluate to nan + // in 1/0 case. Ex: gfunc = x, domain = [0,4] (possible in spherical + // coordinates) + + // Neumann boundary conditions + // For div and grad, the interior trace is used to calculate the flux, + // similar to an outflow boundary condition. For penalty, nothing is + // added. } + } // for i - // If neumann - // (gradient u)*num_cells = g - // by splitting grad u = q by LDG methods, the B.C is changed to - // q*num_cells = g (=> q = g for 1D variable) - // only work for derivatives greater than 1 +#pragma omp single + { + // special case, handle the left and right boundary conditions + // the first thread that exits the for-loop above will do this work + + // need to consider various types of boundary conditions on left/right + // but we have a possible case of 1 cell, so left-most is also right-most - // Neumann boundary conditions - // For div and grad, the interior trace is used to calculate the flux, - // similar to an outflow boundary condition. For penalty, nothing is - // added. + apply_volume(0); // left-most cell + if (num_cells > 1) // if right-most is not left-most + apply_volume(num_cells - 1); - if (left == boundary_condition::neumann) // left neumann + if constexpr (coeff_type == coefficient_type::grad or + coeff_type == coefficient_type::div or + coeff_type == coefficient_type::penalty) { - if (i == 0) + // get index for the last element (first is zero) + int const last = dim.get_degree() * (num_cells - 1); + + P fluxL2 = 0.5 * g_dv_func(dim.domain_min, time); + P fluxR2 = 0.5 * g_dv_func(dim.domain_min + grid_spacing, time); + + P fluxL2abs = pterm.get_flux_scale() * std::abs(fluxL2); + P fluxR2abs = pterm.get_flux_scale() * std::abs(fluxR2); + + if constexpr (coeff_type == coefficient_type::penalty) { - trace_value_1 = - (legendre_poly_L_t * (legendre_poly_L - legendre_poly_L)) * (-1); - if (pterm.coeff_type == coefficient_type::penalty) - { - trace_value_2 = - (legendre_poly_L_t * (legendre_poly_L - legendre_poly_L)) * - (-1); - } - else - { - trace_value_2 = - (legendre_poly_L_t * legendre_poly_L) * (-1 * flux_left); - } - trace_value_3 = - (legendre_poly_R_t * legendre_poly_R) * (+1 * flux_right / 2) * - central_coeff + - (legendre_poly_R_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); - trace_value_4 = - (legendre_poly_R_t * legendre_poly_L) * (+1 * flux_right / 2) * - central_coeff + - (legendre_poly_R_t * legendre_poly_L) * - (-1 * pterm.get_flux_scale() * std::abs(flux_right) / 2 * +1); + fluxL2 = 0; + fluxR2 = 0; } - } - if (right == boundary_condition::neumann) // right neumann - { - if (i == num_cells - 1) + // handle the left-boundary + switch (pterm.ileft) { - trace_value_1 = - (legendre_poly_L_t * legendre_poly_R) * (-1 * flux_left / 2) * - central_coeff + - (legendre_poly_L_t * legendre_poly_R) * - (+1 * pterm.get_flux_scale() * std::abs(flux_left) / 2 * -1); - trace_value_2 = - (legendre_poly_L_t * legendre_poly_L) * (-1 * flux_left / 2) * - central_coeff + - (legendre_poly_L_t * legendre_poly_L) * - (-1 * pterm.get_flux_scale() * std::abs(flux_left) / 2 * -1); - if (pterm.coeff_type == coefficient_type::penalty) - { - trace_value_3 = - (legendre_poly_R_t * (legendre_poly_R - legendre_poly_R)) * - (+1); - } - else - { - trace_value_3 = - (legendre_poly_R_t * legendre_poly_R) * (+1 * flux_right); - } - trace_value_4 = - (legendre_poly_R_t * (legendre_poly_R - legendre_poly_R)) * (+1); + case boundary_condition::dirichlet: + // If penalty then we add <|g|/2[f],[v]> + // Else we're wanting no flux as this is handed by the + // boundary conditions. + if constexpr (coeff_type == coefficient_type::penalty) + coeff_axpy(0, 0, fluxL2abs, matrix_LtL); + break; + + case boundary_condition::neumann: + // If penalty then we add nothing + // Else we want to standard (outflow) flux + // = + if constexpr (coeff_type != coefficient_type::penalty) + coeff_axpy(0, 0, -2.0 * fluxL2, matrix_LtL); + break; + + default: // case boundary_condition::periodic + coeff_axpy(0, last, -fluxL2 - fluxL2abs, matrix_LtR); + coeff_axpy(0, 0, -fluxL2 + fluxL2abs, matrix_LtL); + break; } - } - // Add trace values to matrix + if (num_cells > 1) + { + // right boundary of the left-most cell is in the interior + coeff_axpy(0, 0, fluxR2 + fluxR2abs, matrix_RtR); + coeff_axpy(0, porder + 1, fluxR2 - fluxR2abs, matrix_RtL); - int row1 = current; - int col1 = current - dim.get_degree(); + // at this point, we are done with the left-most cell + // switch the flux to the right-most cell - int row2 = current; - int col2 = current; + fluxL2 = 0.5 * g_dv_func(dim.domain_max - grid_spacing, time); + fluxR2 = 0.5 * g_dv_func(dim.domain_max, time); - int row3 = current; - int col3 = current; + fluxL2abs = pterm.get_flux_scale() * std::abs(fluxL2); + fluxR2abs = pterm.get_flux_scale() * std::abs(fluxR2); - int row4 = current; - int col4 = current + dim.get_degree(); + if constexpr (coeff_type == coefficient_type::penalty) + { + fluxL2 = 0; + fluxR2 = 0; + } - if (left == boundary_condition::periodic || - right == boundary_condition::periodic) - { - if (i == 0) - { - row1 = current; - col1 = last; + // left boundary of the right-most cell is in the interior + coeff_axpy(last, last - porder - 1, -fluxL2 - fluxL2abs, matrix_LtR); + coeff_axpy(last, last, -fluxL2 + fluxL2abs, matrix_LtL); } - if (i == num_cells - 1) + + // handle the right boundary condition + switch (pterm.iright) { - row4 = current; - col4 = first; + case boundary_condition::dirichlet: + if constexpr (coeff_type == coefficient_type::penalty) + coeff_axpy(last, last, fluxR2abs, matrix_RtR); + break; + + case boundary_condition::neumann: + if constexpr (coeff_type != coefficient_type::penalty) + coeff_axpy(last, last, 2.0 * fluxR2, matrix_RtR); + break; + + default: // case boundary_condition::periodic + coeff_axpy(last, last, fluxR2 + fluxR2abs, matrix_RtR); + coeff_axpy(last, 0, fluxR2 - fluxR2abs, matrix_RtL); + break; } } + } // #pragma omp single - if (i != 0 || left == boundary_condition::periodic || - right == boundary_condition::periodic) - { - // Add trace part 1 - fk::matrix block1(coefficients, row1, - row1 + dim.get_degree() - 1, col1, - col1 + dim.get_degree() - 1); - block1 = block1 + trace_value_1; - } + } // #pragma omp parallel - // Add trace part 2 - fk::matrix block2(coefficients, row2, - row2 + dim.get_degree() - 1, col2, - col2 + dim.get_degree() - 1); - block2 = block2 + trace_value_2; - - // Add trace part 3 - fk::matrix block3(coefficients, row3, - row3 + dim.get_degree() - 1, col3, - col3 + dim.get_degree() - 1); - block3 = block3 + trace_value_3; - if (i != num_cells - 1 || left == boundary_condition::periodic || - right == boundary_condition::periodic) - { - // Add trace part 4 - fk::matrix block4(coefficients, row4, - row4 + dim.get_degree() - 1, col4, - col4 + dim.get_degree() - 1); - block4 = block4 + trace_value_4; - } - } - } - - if (pterm.coeff_type == coefficient_type::grad) + if constexpr (coeff_type == coefficient_type::grad) { // take the negative transpose of div coefficients.transpose(); @@ -621,6 +559,25 @@ fk::matrix

generate_coefficients( return coefficients; } +template +fk::matrix

generate_coefficients( + dimension

const &dim, partial_term

const &pterm, + basis::wavelet_transform const &transformer, + int const level, P const time, bool const rotate) +{ + switch (pterm.coeff_type) + { + case coefficient_type::mass: + return generate_coefficients(dim, pterm, transformer, level, time, rotate); + case coefficient_type::grad: + return generate_coefficients(dim, pterm, transformer, level, time, rotate); + case coefficient_type::div: + return generate_coefficients(dim, pterm, transformer, level, time, rotate); + default: // case coefficient_type::penalty: + return generate_coefficients(dim, pterm, transformer, level, time, rotate); + } +} + #ifdef ASGARD_ENABLE_DOUBLE template fk::matrix generate_coefficients( dimension const &dim, partial_term const &pterm, diff --git a/src/fast_math.hpp b/src/fast_math.hpp index cada023cf..04117f674 100644 --- a/src/fast_math.hpp +++ b/src/fast_math.hpp @@ -194,23 +194,29 @@ gemm(fk::matrix const &A, fk::matrix const &B, expect(C.ncols() == cols_B); expect(cols_A == rows_B); - int lda = A.stride(); - int ldb = B.stride(); - int ldc = C.stride(); - P alpha_ = alpha; - P beta_ = beta; + int lda = A.stride(); + int ldb = B.stride(); + int ldc = C.stride(); + char const transa = trans_A ? 't' : 'n'; char const transb = trans_B ? 't' : 'n'; - int m = rows_A; - int n = cols_B; - int k = rows_B; - lib_dispatch::gemm(transa, transb, m, n, k, alpha_, A.data(), lda, - B.data(), ldb, beta_, C.data(), ldc); + lib_dispatch::gemm(transa, transb, rows_A, cols_B, rows_B, alpha, A.data(), lda, + B.data(), ldb, beta, C.data(), ldc); return C; } +// gemm - shortcut giving alpha/beta before the matrices and skips the transposes +template +fk::matrix & +gemm(P const alpha, fk::matrix const &A, fk::matrix const &B, + P const beta, fk::matrix &C) +{ + return gemm(A, B, C, false, false, alpha, beta); +} + /** gesv - Solve Ax=B using LU decomposition * * \param A n-by-n coefficient matrix