diff --git a/CMakeLists.txt b/CMakeLists.txt index dc9801a59..612f45bdc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,14 @@ option (ASGARD_USE_TIMER "Enable the builtin profiling tool" ON) option (ASGARD_RECOMMENDED_DEFAULTS "Enable OpenMP, set some flags, download OpenBLAS if system BLAS is missing." OFF) option (ASGARD_BUILD_DOCS "(incomplete) Build the documentation." OFF) +if (ASGARD_USE_TIMER) + if (ASGARD_USE_MPI) + option (ASGARD_USE_FLOPCOUNTER "Counts flops as part of the timing process" ON) + else() + option (ASGARD_USE_FLOPCOUNTER "Counts flops as part of the timing process" OFF) + endif() +endif() + if (NOT ASGARD_USE_MPI AND ASGARD_USE_CUDA) message(FATAL_ERROR "CUDA has been temporarily disabled for the non-mpi mode") endif() @@ -636,6 +644,9 @@ if (ASGARD_BUILD_TESTS) ASGARD_USE_TIMER) message(STATUS " ${_opt}=${${_opt}}") endforeach() + if (ASGARD_USE_TIMER) + message(STATUS " ASGARD_USE_FLOPCOUNTER=${ASGARD_USE_FLOPCOUNTER}") + endif() if (ASGARD_USE_CUDA) foreach(_opt CMAKE_CUDA_COMPILER CMAKE_CUDA_FLAGS ASGARD_USE_GPU_MEM_LIMIT) message(STATUS " ${_opt}=${${_opt}}") diff --git a/src/asgard_adapt.cpp b/src/asgard_adapt.cpp index ffc9d6034..fa1f34848 100644 --- a/src/asgard_adapt.cpp +++ b/src/asgard_adapt.cpp @@ -108,6 +108,7 @@ template fk::vector

distributed_grid

::coarsen_solution(PDE

&pde, fk::vector

const &x) { + auto session = tools::time_session("coarsen solution"); auto const coarse_y = this->coarsen(x, pde.options()); update_levels(this->get_table(), pde); return coarse_y; @@ -117,6 +118,7 @@ template fk::vector

distributed_grid

::refine_solution(PDE

&pde, fk::vector

const &x) { + auto session = tools::time_session("refine solution"); auto const refine_y = this->refine(x, pde.options()); update_levels(this->get_table(), pde); return refine_y; diff --git a/src/asgard_block_matrix.hpp b/src/asgard_block_matrix.hpp index 4863cac9c..da2633186 100644 --- a/src/asgard_block_matrix.hpp +++ b/src/asgard_block_matrix.hpp @@ -460,6 +460,10 @@ struct block_sparse_matrix P *operator[] (int64_t i) { return data_[i]; } //! returns the block at the given index P const *operator[] (int64_t i) const { return data_[i]; } + //! returns the internal data + P *data() { return data_[0]; } + //! returns the internal data (const overload) + P const *data() const { return data_[0]; } //! converts the matrix to a full one, mostly for testing/plotting block_matrix

to_full(connection_patterns const &conns) const diff --git a/src/asgard_boundary_conditions.cpp b/src/asgard_boundary_conditions.cpp index e0a70c4af..906d8ae77 100644 --- a/src/asgard_boundary_conditions.cpp +++ b/src/asgard_boundary_conditions.cpp @@ -111,6 +111,7 @@ std::array, 2> make_unscaled_bc_parts( connection_patterns const &conn, int const start_element, int const stop_element, P const t_init) { + tools::time_event timing("make unscaled bc"); expect(start_element >= 0); expect(stop_element < table.size()); expect(stop_element >= start_element); diff --git a/src/asgard_build_info.hpp.in b/src/asgard_build_info.hpp.in index 07bd50f01..247500a80 100644 --- a/src/asgard_build_info.hpp.in +++ b/src/asgard_build_info.hpp.in @@ -12,6 +12,7 @@ #cmakedefine ASGARD_USE_HIGHFIVE #cmakedefine ASGARD_USE_TIMER +#cmakedefine ASGARD_USE_FLOPCOUNTER #cmakedefine ASGARD_USE_CUDA #cmakedefine ASGARD_USE_GPU_MEM_LIMIT #cmakedefine ASGARD_USE_OPENMP diff --git a/src/asgard_discretization.cpp b/src/asgard_discretization.cpp index e8a32154a..40767bb1b 100644 --- a/src/asgard_discretization.cpp +++ b/src/asgard_discretization.cpp @@ -88,6 +88,11 @@ discretization_manager::discretization_manager( fixed_bc = boundary_conditions::make_unscaled_bc_parts( *pde, grid.get_table(), transformer, hier, matrices, conn, msg.row_start, msg.row_stop); +#ifdef KRON_MODE_GLOBAL + // the imex-flag is not used internally + kronops.make(imex_flag::unspecified, *pde, matrices, grid); +#endif + if (high_verbosity()) node_out() << " generating: moment vectors..." << '\n'; diff --git a/src/asgard_discretization.hpp b/src/asgard_discretization.hpp index ebb8f4d6e..db512f1f5 100644 --- a/src/asgard_discretization.hpp +++ b/src/asgard_discretization.hpp @@ -253,13 +253,19 @@ class discretization_manager //! update components on grid reset void update_grid_components() { + tools::time_event performance("update grid components"); kronops.clear(); generate_coefficients(*pde, matrices, conn, hier, time_, coeff_update_mode::independent); -#ifndef KRON_MODE_GLOBAL + +#ifdef KRON_MODE_GLOBAL + // the imex-flag is not used internally + kronops.make(imex_flag::unspecified, *pde, matrices, grid); +#else pde->coeffs_.resize(pde->num_terms() * pde->num_dims()); for (int64_t t : indexof(pde->coeffs_.size())) pde->coeffs_[t] = matrices.term_coeffs[t].to_fk_matrix(degree_ + 1, conn); #endif + auto const my_subgrid = grid.get_subgrid(get_rank()); fixed_bc = boundary_conditions::make_unscaled_bc_parts( *pde, grid.get_table(), transformer, hier, matrices, @@ -273,7 +279,7 @@ class discretization_manager //! rebuild the moments void reset_moments() { - tools::time_event performance("update_system"); + tools::time_event performance("reset moments"); int const level = pde->get_dimensions()[0].get_level(); precision const min = pde->get_dimensions()[0].domain_min; diff --git a/src/asgard_kron_operators.hpp b/src/asgard_kron_operators.hpp index 4b68cd612..c23708304 100644 --- a/src/asgard_kron_operators.hpp +++ b/src/asgard_kron_operators.hpp @@ -38,7 +38,8 @@ struct kron_operators } template - void apply(imex_flag entry, precision alpha, precision const x[], precision beta, precision y[]) const + void apply(imex_flag entry, precision alpha, precision const x[], + precision beta, precision y[]) const { apply(entry, 0, alpha, x, beta, y); } @@ -203,15 +204,19 @@ struct kron_operators {} template - void apply(imex_flag entry, precision alpha, precision const x[], precision beta, precision y[]) const + void apply(imex_flag entry, precision alpha, precision const x[], + precision beta, precision y[]) const { apply(entry, precision{0}, alpha, x, beta, y); } //! \brief Apply the given matrix entry template - void apply(imex_flag entry, precision time, precision alpha, precision const x[], precision beta, precision y[]) const + void apply(imex_flag entry, precision time, precision alpha, precision const x[], + precision beta, precision y[]) const { + auto const &terms = term_groups_[static_cast(entry)]; + // prep stage for the operator application // apply the beta parameter, all operations are incremental if (beta == 0) @@ -220,10 +225,9 @@ struct kron_operators lib_dispatch::scal(kglobal.num_active(), beta, y, 1); // if any work will be done, copy x into the padded workspace - if (kglobal.is_active(entry) or interp) - std::copy_n(x, kglobal.num_active(), workspace.x.begin()); + std::copy_n(x, kglobal.num_active(), workspace.x.begin()); - kglobal.template apply(entry, alpha, y); + kglobal.template apply(*tcoeffs, terms, alpha, y); if (interp) { @@ -249,7 +253,7 @@ struct kron_operators int64_t flops(imex_flag entry) const { - return kglobal.flops(entry); + return kglobal.flops(entry, term_groups_); } //! \brief Make the matrix for the given entry @@ -257,49 +261,60 @@ struct kron_operators coefficient_matrices &cmats, adapt::distributed_grid const &grid) { - if (pde_ == nullptr and pde.has_interp()) + tools::time_event timing("make kron-operators"); + tcoeffs = &cmats.term_coeffs; + if (pde_ == nullptr) { - pde.get_domain_bounds(dmin, dslope); - domain_scale = precision{1}; - for (int d = 0; d < pde.num_dims(); d++) + pde_ = &pde; + for (auto im : {imex_flag::unspecified, imex_flag::imex_explicit, imex_flag::imex_implicit}) + term_groups_[static_cast(im)] = get_used_terms(pde, im); + + if (pde.has_interp()) { - dslope[d] -= dmin[d]; - domain_scale *= dslope[d]; - } - domain_scale = precision{1} / std::sqrt(domain_scale); + pde.get_domain_bounds(dmin, dslope); + domain_scale = precision{1}; + for (int d = 0; d < pde.num_dims(); d++) + { + dslope[d] -= dmin[d]; + domain_scale *= dslope[d]; + } + domain_scale = precision{1} / std::sqrt(domain_scale); - pde_ = &pde; - interp = interpolation(pde_->num_dims(), conn_->get(connect_1d::hierarchy::volume), &workspace); + + interp = interpolation(pde_->num_dims(), conn_->get(connect_1d::hierarchy::volume), &workspace); + } } if (not kglobal) { kglobal = make_block_global_kron_matrix( pde, grid, conn_->get(connect_1d::hierarchy::volume), conn_->get(connect_1d::hierarchy::full), &workspace, verbosity); - set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal); if (interp) { finterp.resize(workspace.x.size()); inodes.clear(); } } - else if (not kglobal.specific_is_set(entry)) - set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal); + + // rebuild the preconditioner + if (entry == imex_flag::imex_implicit or pde.use_implicit()) + { + int const imex_indx = static_cast(entry); + build_preconditioner(pde, cmats, *conn_, grid, + term_groups_[imex_indx], kglobal.pre_con_); + } } /*! * \brief Either makes the matrix or if it exists, just updates only the * coefficients + * + * TODO: remove this method once the local-mode no longer needs this. */ - void reset_coefficients(imex_flag entry, PDE const &pde, - coefficient_matrices &cmats, - adapt::distributed_grid const &grid) - { - if (not kglobal) - make(entry, pde, cmats, grid); - else - set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal); - } + void reset_coefficients(imex_flag, PDE const &, + coefficient_matrices &, + adapt::distributed_grid const &) + {} //! \brief Clear all matrices void clear() @@ -383,6 +398,9 @@ struct kron_operators std::array dmin, dslope; connection_patterns const *conn_ = nullptr; + std::array, 3> term_groups_; + std::vector> const *tcoeffs = nullptr; + block_global_kron_matrix kglobal; interpolation interp; diff --git a/src/asgard_kronmult_matrix.cpp b/src/asgard_kronmult_matrix.cpp index 654a233eb..d72bdf4a0 100644 --- a/src/asgard_kronmult_matrix.cpp +++ b/src/asgard_kronmult_matrix.cpp @@ -28,12 +28,6 @@ std::vector get_used_terms(PDE const &pde, imex_flag const imex) } } -/*! - * \brief Constructs a preconditioner - * - * The preconditioner should go into another file, but that will come with a - * big cleanup of the kronmult logic (and the removal of the old code). - */ template void build_preconditioner(PDE const &pde, coefficient_matrices const &cmats, @@ -1101,17 +1095,14 @@ int get_flux_direction(PDE const &pde, int term_id) template template void block_global_kron_matrix::apply( - imex_flag etype, precision alpha, precision *y) const + std::vector> const &tcoeffs, + std::vector const &used_terms, precision alpha, precision *y) const { - int const imex = static_cast(etype); - - std::vector const &used_terms = term_groups_[imex]; - std::fill_n(workspace_->y.begin(), num_padded_, precision{0}); kronmult::global_cpu(num_dimensions_, blockn_, block_size_, ilist_, dsort_, perms_, flux_dir_, *conn_volumes_, *conn_full_, - gvals_, used_terms, workspace_->x.data(), + tcoeffs, used_terms, workspace_->x.data(), workspace_->y.data(), *workspace_); precision const *py = workspace_->y.data(); @@ -1128,6 +1119,7 @@ make_block_global_kron_matrix(PDE const &pde, kronmult::block_global_workspace *workspace, verbosity_level verb) { + tools::time_event timing("make block-global matrix"); int const degree = pde.get_dimensions()[0].get_degree(); int const num_dimensions = pde.num_dims(); @@ -1185,49 +1177,22 @@ make_block_global_kron_matrix(PDE const &pde, workspace, verb); } -template -void set_specific_mode(PDE const &pde, - coefficient_matrices const &cmats, - connection_patterns const &conns, - adapt::distributed_grid const &dis_grid, - imex_flag const imex, - block_global_kron_matrix &mat) -{ - int const imex_indx = static_cast(imex); - - mat.term_groups_[imex_indx] = get_used_terms(pde, imex); - - std::vector const &used_terms = mat.term_groups_[imex_indx]; - - int const num_dimensions = pde.num_dims(); - - for (int const t : used_terms) - { - for (int d : indexof(num_dimensions)) - { - if (not pde.get_terms()[t][d].is_identity()) - { - // This should be an alias and not a copy - cmats.term_coeffs[t * num_dimensions + d].copy_out(mat.gvals_[t * num_dimensions + d]); - } - } - } - - if (imex == imex_flag::imex_implicit or pde.use_implicit()) - // prepare a preconditioner - build_preconditioner(pde, cmats, conns, dis_grid, used_terms, mat.pre_con_); -} - #endif // KRON_MODE_GLOBAL #ifdef ASGARD_ENABLE_DOUBLE +template void build_preconditioner( + PDE const &, coefficient_matrices const &, + connection_patterns const &, adapt::distributed_grid const &, + std::vector const &, std::vector &); + template std::vector get_used_terms(PDE const &pde, imex_flag const imex); #ifdef KRON_MODE_GLOBAL template class block_global_kron_matrix; template void block_global_kron_matrix::apply( - imex_flag, double, double *) const; + std::vector> const &, std::vector const &, + double, double *) const; template block_global_kron_matrix make_block_global_kron_matrix(PDE const &, @@ -1235,12 +1200,6 @@ make_block_global_kron_matrix(PDE const &, connect_1d const *, connect_1d const *, kronmult::block_global_workspace *, verbosity_level); -template void set_specific_mode(PDE const &, - coefficient_matrices const &, - connection_patterns const &conns, - adapt::distributed_grid const &, - imex_flag const, - block_global_kron_matrix &); #else // KRON_MODE_GLOBAL @@ -1268,11 +1227,17 @@ compute_mem_usage(PDE const &, template std::vector get_used_terms(PDE const &pde, imex_flag const imex); +template void build_preconditioner( + PDE const &, coefficient_matrices const &, + connection_patterns const &, adapt::distributed_grid const &, + std::vector const &, std::vector &); + #ifdef KRON_MODE_GLOBAL template class block_global_kron_matrix; template void block_global_kron_matrix::apply( - imex_flag, float, float *) const; + std::vector> const &, std::vector const &, + float, float *) const; template block_global_kron_matrix make_block_global_kron_matrix(PDE const &, @@ -1280,12 +1245,6 @@ make_block_global_kron_matrix(PDE const &, connect_1d const *, connect_1d const *, kronmult::block_global_workspace *, verbosity_level); -template void set_specific_mode(PDE const &, - coefficient_matrices const &, - connection_patterns const &, - adapt::distributed_grid const &, - imex_flag const, - block_global_kron_matrix &); #else // KRON_MODE_GLOBAL diff --git a/src/asgard_kronmult_matrix.hpp b/src/asgard_kronmult_matrix.hpp index f4d5f2d41..6527eea86 100644 --- a/src/asgard_kronmult_matrix.hpp +++ b/src/asgard_kronmult_matrix.hpp @@ -110,6 +110,20 @@ struct coefficient_matrices template std::vector get_used_terms(PDE const &pde, imex_flag const imex); +/*! + * \brief Constructs a preconditioner + * + * The preconditioner should go into another file, but that will come with a + * big cleanup of the kronmult logic (and the removal of the old code). + */ +template +void build_preconditioner(PDE const &pde, + coefficient_matrices const &cmats, + connection_patterns const &conns, + adapt::distributed_grid const &dis_grid, + std::vector const &used_terms, + std::vector &pc); + #ifndef KRON_MODE_GLOBAL // using LOCAL kronmult, can be parallelised using MPI but much more expensive // then the global modes below @@ -908,8 +922,7 @@ class block_global_kron_matrix num_dimensions_(num_dimensions), blockn_(blockn), block_size_(block_size), ilist_(std::move(ilist)), dsort_(std::move(dsort)), perms_(std::move(perms)), flux_dir_(std::move(flux_dir)), conn_volumes_(conn_volumes), - conn_full_(conn_full), gvals_(flux_dir_.size() * num_dimensions_), - workspace_(workspace), verb(verb_in) + conn_full_(conn_full), workspace_(workspace), verb(verb_in) { for (auto &f : flops_) f = -1; @@ -930,23 +943,11 @@ class block_global_kron_matrix * where A corresponds to the imex etype flag. */ template - void apply(imex_flag etype, precision alpha, precision *y) const; + void apply(std::vector> const &tcoeffs, + std::vector const &terms, precision alpha, precision y[]) const; operator bool() const { return (num_dimensions_ > 0); } - bool specific_is_set(imex_flag etype) - { - std::vector const &terms = term_groups_[static_cast(etype)]; - if (terms.empty()) - return true; // nothing to set, so we're OK - - for (int d = 0; d < num_dimensions_; d++) - if (not gvals_[terms.front() * num_dimensions_ + d].empty()) - return true; - - return false; - } - //! \brief Allows overwriting of the loaded coefficients. template auto const &get_diagonal_preconditioner() const @@ -955,15 +956,26 @@ class block_global_kron_matrix return pre_con_; } - //! \brief Return the number of flops for the current matrix type - int64_t flops(imex_flag etype) const + //! \brief Return the number of flops for the current matrix type, if enabled for timing + int64_t flops(imex_flag etype, std::array, 3> term_groups) const + { +#ifdef ASGARD_USE_FLOPCOUNTER + return count_flops(etype, term_groups); +#else + ignore(etype); + ignore(term_groups); + return 0; +#endif + } + //! \brief Counts the number of floating point operations + int64_t count_flops(imex_flag etype, std::array, 3> term_groups) const { int i = static_cast(etype); if (flops_[i] == -1) { flops_[i] = kronmult::block_global_count_flops( num_dimensions_, block_size_, ilist_, dsort_, perms_, - flux_dir_, *conn_volumes_, *conn_full_, term_groups_[i], *workspace_); + flux_dir_, *conn_volumes_, *conn_full_, term_groups[i], *workspace_); if (verb == verbosity_level::high) { switch (etype) @@ -988,22 +1000,8 @@ class block_global_kron_matrix dimension_sort const &get_dsort() const { return dsort_; }; int64_t num_active() const { return num_active_; } - bool is_active(imex_flag etype) const - { - return not term_groups_[static_cast(etype)].empty(); - } - - // made friends for two reasons - // 1. Keeps the matrix API free from references to pde, which will allow an easier - // transition to a new API that does not require the PDE class - // 2. Give the ability to modify the internal without encumbering the matrix API - friend void set_specific_mode( - PDE const &pde, - coefficient_matrices const &cmats, - connection_patterns const &conns, - adapt::distributed_grid const &dis_grid, - imex_flag const imex, - block_global_kron_matrix &mat); + // preconditioner + std::vector pre_con_; private: int64_t num_active_ = 0; @@ -1018,16 +1016,10 @@ class block_global_kron_matrix connect_1d const *conn_volumes_ = nullptr; connect_1d const *conn_full_ = nullptr; - std::vector> gvals_; - std::array, 3> term_groups_; - mutable kronmult::block_global_workspace *workspace_ = nullptr; mutable std::array flops_; - // preconditioner - std::vector pre_con_; - verbosity_level verb = verbosity_level::quiet; }; diff --git a/src/asgard_time_advance.cpp b/src/asgard_time_advance.cpp index dcc213695..12e261c79 100644 --- a/src/asgard_time_advance.cpp +++ b/src/asgard_time_advance.cpp @@ -356,12 +356,12 @@ imex_advance(discretization_manager

&disc, disc.get_cmatrices(), adaptive_grid); // Explicit step f_1s = f_0 + dt A f_0 - tools::timer.start("explicit_1"); fk::vector fx(f.size()); + tools::timer.start("explicit_1"); { tools::time_event kronm_( - "kronmult - explicit", operator_matrices.flops(imex_flag::imex_explicit)); + "kronmult - explicit 1", operator_matrices.flops(imex_flag::imex_explicit)); operator_matrices.template apply(imex_flag::imex_explicit, 1.0, f.data(), 0.0, fx.data()); } @@ -439,7 +439,6 @@ imex_advance(discretization_manager

&disc, // -------------------------------- // Second Stage // -------------------------------- - tools::timer.start("explicit_2"); fm::copy(f_orig_dev, f); // f here is now f_0 #ifdef ASGARD_USE_CUDA @@ -458,10 +457,11 @@ imex_advance(discretization_manager

&disc, operator_matrices.reset_coefficients(imex_flag::imex_explicit, pde, disc.get_cmatrices(), adaptive_grid); + tools::timer.start("explicit_2"); // Explicit step f_2s = 0.5*f_0 + 0.5*(f_1 + dt A f_1) { tools::time_event kronm_( - "kronmult - explicit", operator_matrices.flops(imex_flag::imex_explicit)); + "kronmult - explicit 2", operator_matrices.flops(imex_flag::imex_explicit)); operator_matrices.template apply(imex_flag::imex_explicit, 1.0, f_1.data(), 0.0, fx.data()); } @@ -579,8 +579,7 @@ void advance_time(discretization_manager

&manager, int64_t num_steps) { // take a time advance step auto const time = manager.time(); - const char *time_str = "time_advance"; - const std::string time_id = tools::timer.start(time_str); + const std::string time_id = tools::timer.start("time_advance"); fk::vector

f_val = [&]() -> fk::vector

{ @@ -688,6 +687,7 @@ void advance_time(discretization_manager

&manager, int64_t num_steps) if (manager.high_verbosity() and not pde.options().ignore_exact) { + auto session = tools::time_session("compute exact solution"); auto rmse = manager.rmse_exact_sol(); if (rmse) { diff --git a/src/asgard_tools.cpp b/src/asgard_tools.cpp index e35b73f1f..7d4a28956 100644 --- a/src/asgard_tools.cpp +++ b/src/asgard_tools.cpp @@ -109,6 +109,7 @@ std::string simple_timer::report() times.pop_back(); } +#ifdef ASGARD_USE_FLOPCOUNTER report << "\n"; std::string const gf = "-- Gflops/s -- "; @@ -129,6 +130,7 @@ std::string simple_timer::report() report << pad_string(fsum / gflops.size()) << pad_string(min) << pad_string(max) << '\n'; } } +#endif return report.str(); } diff --git a/src/device/asgard_glkronmult_bcpu.cpp b/src/device/asgard_glkronmult_bcpu.cpp index 38829bf4e..454ddffca 100644 --- a/src/device/asgard_glkronmult_bcpu.cpp +++ b/src/device/asgard_glkronmult_bcpu.cpp @@ -760,7 +760,7 @@ void global_cpu(int num_dimensions, int n, int64_t block_size, std::vector const &perms, std::vector const &flux_dir, connect_1d const &conn_volumes, connect_1d const &conn_full, - std::vector> const &gvals, + std::vector> const &cmats, std::vector const &terms, precision const x[], precision y[], block_global_workspace &workspace) @@ -800,14 +800,14 @@ void global_cpu(int num_dimensions, int n, int64_t block_size, global_cpu(num_dimensions, n, ilist, dsort, dir, perm.fill[i][0], get_connect_1d(flux_dir[t], perm.fill[i][0]), - gvals[t * num_dimensions + dir].data(), x, w1, workspace.row_map); + cmats[t * num_dimensions + dir].data(), x, w1, workspace.row_map); for (int d = 1; d < active_dims; d++) { dir = perm.direction[i][d]; global_cpu(num_dimensions, n, ilist, dsort, dir, perm.fill[i][d], get_connect_1d(flux_dir[t], perm.fill[i][d]), - gvals[t * num_dimensions + dir].data(), w1, w2, workspace.row_map); + cmats[t * num_dimensions + dir].data(), w1, w2, workspace.row_map); std::swap(w1, w2); } @@ -965,13 +965,12 @@ void globalsv_cpu(int num_dimensions, int n, vector2d const &ilist, #ifdef ASGARD_ENABLE_DOUBLE -template void global_cpu(int, int, int64_t, - vector2d const &, dimension_sort const &, - std::vector const &, - std::vector const &, connect_1d const &, - connect_1d const &, std::vector> const &, - std::vector const &, double const[], double[], - block_global_workspace &); +template void global_cpu( + int, int, int64_t, vector2d const &, dimension_sort const &, + std::vector const &, std::vector const &, connect_1d const &, + connect_1d const &, std::vector> const &, + std::vector const &, double const[], double[], + block_global_workspace &); template int64_t block_global_count_flops( int num_dimensions, int64_t block_size, @@ -1000,13 +999,12 @@ template void globalsv_cpu( #ifdef ASGARD_ENABLE_FLOAT -template void global_cpu(int, int, int64_t, - vector2d const &, dimension_sort const &, - std::vector const &, - std::vector const &, connect_1d const &, - connect_1d const &, std::vector> const &, - std::vector const &, float const[], float[], - block_global_workspace &); +template void global_cpu( + int, int, int64_t, vector2d const &, dimension_sort const &, + std::vector const &, std::vector const &, connect_1d const &, + connect_1d const &, std::vector> const &, + std::vector const &, float const[], float[], + block_global_workspace &); template int64_t block_global_count_flops( int num_dimensions, int64_t block_size, diff --git a/src/device/asgard_kronmult.hpp b/src/device/asgard_kronmult.hpp index cab0c02cf..8114f69ea 100644 --- a/src/device/asgard_kronmult.hpp +++ b/src/device/asgard_kronmult.hpp @@ -4,7 +4,7 @@ #include #include -#include "asgard_indexset.hpp" +#include "asgard_block_matrix.hpp" #include "asgard_interpolation1d.hpp" #include "asgard_kronmult_common.hpp" @@ -300,7 +300,7 @@ void global_cpu(int num_dimensions, int n, int64_t block_size, std::vector const &perms, std::vector const &flux_dir, connect_1d const &conn_volumes, connect_1d const &conn_full, - std::vector> const &gvals, + std::vector> const &cmats, std::vector const &terms, precision const x[], precision y[], block_global_workspace &workspace);