From 43d885746185b509ae851e22e0dc489686a64a49 Mon Sep 17 00:00:00 2001
From: Miroslav Stoyanov <stoyanovmk@ornl.gov>
Date: Thu, 23 Jan 2025 13:58:22 -0500
Subject: [PATCH] cleaning up docs and sharp edges

---
 src/asgard_discretization.cpp        |  10 +-
 src/asgard_discretization.hpp        |   3 +-
 src/asgard_io.cpp                    |  21 ++--
 src/asgard_program_options.cpp       |  13 +--
 src/asgard_program_options.hpp       |  12 ++-
 src/asgard_program_options_tests.cpp |  16 +--
 src/asgard_solver.cpp                |  33 +++---
 src/asgard_solver.hpp                | 148 +++++++++------------------
 src/asgard_term_manager.cpp          |  20 ++--
 src/asgard_term_manager.hpp          |   9 +-
 src/asgard_time_advance.cpp          |  81 ++++-----------
 src/diffusion.cpp                    |  69 +++++++++----
 src/pde/asgard_pde_base.hpp          |  11 +-
 13 files changed, 208 insertions(+), 238 deletions(-)
diff --git a/src/asgard_discretization.cpp b/src/asgard_discretization.cpp
index 1f5c62a82..10a777f1e 100644
--- a/src/asgard_discretization.cpp
+++ b/src/asgard_discretization.cpp
@@ -155,16 +155,16 @@ void discretization_manager<precision>::start_cold()
 
   degree_ = options.degree.value();
 
-  if (not stop_verbosity())
-    std::cout << "\n -- ASGarD discretization options --\n";
-
   if (high_verbosity()) {
     std::cout << "Branch: " << GIT_BRANCH << '\n';
     std::cout << "Commit Summary: " << GIT_COMMIT_HASH
                     << GIT_COMMIT_SUMMARY << '\n';
-    std::cout << "This executable was built on " << BUILD_TIME << '\n';
+    std::cout << "The library was built on " << BUILD_TIME << '\n';
   }
 
+  if (not stop_verbosity())
+    std::cout << "\n -- ASGarD discretization options --\n";
+
   sgrid = sparse_grid(options);
 
   if (not stop_verbosity()) {
@@ -572,7 +572,7 @@ void discretization_manager<precision>::ode_sv(imex_flag imflag,
       kronops.make(imflag, *pde, matrices, grid);
       precision const tolerance = *options.isolver_tolerance;
       int const restart         = *options.isolver_iterations;
-      int const max_iter        = *options.isolver_outer_iterations;
+      int const max_iter        = *options.isolver_inner_iterations;
       sol.resize(static_cast<int>(x.size()));
       std::copy(x.begin(), x.end(), sol.begin());
       if (solver == solve_opts::gmres)
diff --git a/src/asgard_discretization.hpp b/src/asgard_discretization.hpp
index 6095f5f9e..3b6d10459 100644
--- a/src/asgard_discretization.hpp
+++ b/src/asgard_discretization.hpp
@@ -175,8 +175,7 @@ class discretization_manager
     terms.apply_all(sgrid, conn, alpha, x, beta, y);
   }
   //! applies ADI preconditioner for all terms
-  void terms_apply_adi(std::vector<precision> const &x,
-                       std::vector<precision> &y) const
+  void terms_apply_adi(precision const x[], precision y[]) const
   {
     tools::time_event performance_("terms_apply_adi kronmult");
     terms.apply_all_adi(sgrid, conn, x, y);
diff --git a/src/asgard_io.cpp b/src/asgard_io.cpp
index c2be49d63..ef468db49 100644
--- a/src/asgard_io.cpp
+++ b/src/asgard_io.cpp
@@ -141,7 +141,7 @@ void write_output(PDE<P> const &pde, // std::vector<moment<P>> const &moments,
 
   H5Easy::dump(file, "isolver_tolerance", options.isolver_tolerance.value());
   H5Easy::dump(file, "isolver_iterations", options.isolver_iterations.value());
-  H5Easy::dump(file, "isolver_outer_iterations", options.isolver_outer_iterations.value());
+  H5Easy::dump(file, "isolver_inner_iterations", options.isolver_inner_iterations.value());
 
   // save some basic build info
   H5Easy::dump(file, "GIT_BRANCH", std::string(GIT_BRANCH));
@@ -281,7 +281,7 @@ void h5writer<P>::write(PDEv2<P> const &pde, int degree, sparse_grid const &grid
     H5Easy::dump(file, "solver_method", static_cast<int>(options.solver.value_or(solve_opts::direct)));
     H5Easy::dump(file, "solver_itol", options.isolver_tolerance.value_or(-1));
     H5Easy::dump(file, "solver_iter", options.isolver_iterations.value_or(-1));
-    H5Easy::dump(file, "solver_oiter", options.isolver_outer_iterations.value_or(-1));
+    H5Easy::dump(file, "solver_inner", options.isolver_inner_iterations.value_or(-1));
   }
 
   H5Easy::dump(file, "timer_report", tools::timer.report());
@@ -498,12 +498,21 @@ void h5writer<P>::read(std::string const &filename, bool silent, PDEv2<P> &pde,
   { // solver data section
     if (not pde.options_.solver)
       pde.options_.solver = static_cast<solve_opts>(H5Easy::load<int>(file, "solver_method"));
-    if (not pde.options_.isolver_tolerance)
+    if (not pde.options_.isolver_tolerance) {
       pde.options_.isolver_tolerance = H5Easy::load<double>(file, "solver_itol");
-    if (not pde.options_.isolver_iterations)
+      if (pde.options_.isolver_tolerance.value() < 0)
+        pde.options_.isolver_tolerance = pde.options_.default_isolver_tolerance;
+    }
+    if (not pde.options_.isolver_iterations) {
       pde.options_.isolver_iterations = H5Easy::load<int>(file, "solver_iter");
-    if (not pde.options_.isolver_outer_iterations)
-      pde.options_.isolver_outer_iterations = H5Easy::load<int>(file, "solver_oiter");
+      if (pde.options_.isolver_iterations.value() < 0)
+        pde.options_.isolver_iterations = pde.options_.default_isolver_iterations;
+    }
+    if (not pde.options_.isolver_inner_iterations) {
+      pde.options_.isolver_inner_iterations = H5Easy::load<int>(file, "solver_inner");
+      if (pde.options_.isolver_inner_iterations.value() < 0)
+        pde.options_.isolver_inner_iterations = pde.options_.default_isolver_inner_iterations;
+    }
   }
 
   state = H5Easy::load<std::vector<P>>(file, "state");
diff --git a/src/asgard_program_options.cpp b/src/asgard_program_options.cpp
index d55b74063..2ab744282 100644
--- a/src/asgard_program_options.cpp
+++ b/src/asgard_program_options.cpp
@@ -113,8 +113,9 @@ Options          Short   Value      Description
                                     adi - very experimental, not very stable (yet)
 -isolve-tol      -ist    double     Iterative solver tolerance, applies to GMRES and BICG.
 -isolve-iter     -isi    int        Iterative solver maximum number of iterations,
-                                    for GMRES this is the number of inner iterations.
--isolve-outer    -iso    int        (GMRES only) The maximum number of outer GMRES iterations.
+                                    for GMRES this is the number of outer iterations.
+-isolve-inner    -isn    int        (GMRES only) The maximum number of inner GMRES iterations,
+                                    this is ignored by BiCGSTAB.
 
 Leaving soon:
 -memory                  int        Memory limit for the GPU, applied to the earlier versions
@@ -222,8 +223,8 @@ void prog_opts::process_inputs(std::vector<std::string_view> const &argv,
       {"-kron-mode", optentry::kron_mode},
       {"-isolve-tol", optentry::isol_tolerance}, {"-ist", optentry::isol_tolerance},
       {"-isolve-iter", optentry::isol_iterations}, {"-isi", optentry::isol_iterations},
-      {"-isolve-outer", optentry::isol_outer_iterations},
-      {"-iso", optentry::isol_outer_iterations},
+      {"-isolve-inner", optentry::isol_inner_iterations},
+      {"-isn", optentry::isol_inner_iterations},
       {"-restart", optentry::restart_file},
   };
 
@@ -558,12 +559,12 @@ void prog_opts::process_inputs(std::vector<std::string_view> const &argv,
       }
     }
     break;
-    case optentry::isol_outer_iterations: {
+    case optentry::isol_inner_iterations: {
       auto selected = move_process_next();
       if (not selected)
         throw std::runtime_error(report_no_value());
       try {
-        isolver_outer_iterations = std::stoi(selected->data());
+        isolver_inner_iterations = std::stoi(selected->data());
       } catch(std::invalid_argument &) {
         throw std::runtime_error(report_wrong_value());
       } catch(std::out_of_range &) {
diff --git a/src/asgard_program_options.hpp b/src/asgard_program_options.hpp
index b72731d35..63ae7d9dc 100644
--- a/src/asgard_program_options.hpp
+++ b/src/asgard_program_options.hpp
@@ -69,7 +69,7 @@ enum class preconditioner_opts
   none = 0,
   //! diagonal Jacobi preconditioner
   jacobi,
-  //! using alternating direction pseudoinverse
+  //! using alternating direction pseudoinverse (experimental)
   adi
 };
 
@@ -430,7 +430,7 @@ struct prog_opts
   //! max number of iterations (inner iterations for gmres)
   std::optional<int> isolver_iterations;
   //! max number of output gmres iterations
-  std::optional<int> isolver_outer_iterations;
+  std::optional<int> isolver_inner_iterations;
 
   //! local kron method only, mode dense or sparse (faster but memory hungry)
   std::optional<kronmult_mode> kron_mode;
@@ -626,6 +626,12 @@ struct prog_opts
   std::optional<solve_opts> default_solver;
   //! used in place of the preconditioner type, if preconditioner is not specified
   std::optional<preconditioner_opts> default_precon;
+  //! used in place of the tolerance, if tolerance is not specified
+  std::optional<double> default_isolver_tolerance;
+  //! max number of iterations (inner iterations for gmres)
+  std::optional<int> default_isolver_iterations;
+  //! max number of outer gmres iterations
+  std::optional<int> default_isolver_inner_iterations;
 
   //! returns the first available from stop-time, default-stop-time or -1
   double get_stop_time() const { return stop_time.value_or(default_stop_time.value_or(-1)); }
@@ -683,7 +689,7 @@ struct prog_opts
     kron_mode,
     isol_tolerance,
     isol_iterations,
-    isol_outer_iterations,
+    isol_inner_iterations,
     restart_file,
     set_verbosity
   };
diff --git a/src/asgard_program_options_tests.cpp b/src/asgard_program_options_tests.cpp
index b61a3f00c..3c30bf5e3 100644
--- a/src/asgard_program_options_tests.cpp
+++ b/src/asgard_program_options_tests.cpp
@@ -245,15 +245,15 @@ TEST_CASE("new program options", "[single options]")
     REQUIRE_THROWS_WITH(prog_opts(vecstrview({"exe", "-isi", "dummy"})),
                         "invalid value for -isi, see exe -help");
   }
-  SECTION("-isolve_outer")
+  SECTION("-isolve_inner")
   {
-    prog_opts prog(vecstrview({"", "-isolve-outer", "200"}));
-    REQUIRE(prog.isolver_outer_iterations);
-    REQUIRE(prog.isolver_outer_iterations.value() == 200);
-    REQUIRE_THROWS_WITH(prog_opts(vecstrview({"exe", "-isolve-outer"})),
-                        "-isolve-outer must be followed by a value, see exe -help");
-    REQUIRE_THROWS_WITH(prog_opts(vecstrview({"exe", "-iso", "dummy"})),
-                        "invalid value for -iso, see exe -help");
+    prog_opts prog(vecstrview({"", "-isolve-inner", "200"}));
+    REQUIRE(prog.isolver_inner_iterations);
+    REQUIRE(prog.isolver_inner_iterations.value() == 200);
+    REQUIRE_THROWS_WITH(prog_opts(vecstrview({"exe", "-isolve-inner"})),
+                        "-isolve-inner must be followed by a value, see exe -help");
+    REQUIRE_THROWS_WITH(prog_opts(vecstrview({"exe", "-isn", "dummy"})),
+                        "invalid value for -isn, see exe -help");
   }
   SECTION("-title")
   {
diff --git a/src/asgard_solver.cpp b/src/asgard_solver.cpp
index 7d0a45069..d54049456 100644
--- a/src/asgard_solver.cpp
+++ b/src/asgard_solver.cpp
@@ -657,7 +657,7 @@ int bicgstab<P>::solve(
   auto dot = [&](std::vector<P> const &a, std::vector<P> const &b)
     -> P {
       P r = 0;
-//ASGARD_OMP_PARFOR_SIMD_EXTRA(reduction(+:r))
+ASGARD_OMP_PARFOR_SIMD_EXTRA(reduction(+:r))
       for (int64_t i = 0; i < n; i++)
         r += a[i] * b[i];
       return r;
@@ -665,7 +665,7 @@ int bicgstab<P>::solve(
   auto dot1 = [&](std::vector<P> const &a)
     -> P {
       P r = 0;
-//ASGARD_OMP_PARFOR_SIMD_EXTRA(reduction(+:r))
+ASGARD_OMP_PARFOR_SIMD_EXTRA(reduction(+:r))
       for (int64_t i = 0; i < n; i++)
         r += a[i] * a[i];
       return r;
@@ -684,7 +684,7 @@ ASGARD_OMP_PARFOR_SIMD
   r = rhs;
 
   int num_appy = 1;
-  apply_lhs(-1, x, 1, r); // r0 = b - A * x0
+  apply_lhs(-1, x.data(), 1, r.data()); // r0 = b - A * x0
 
   P rho = dot1(r);
 
@@ -693,7 +693,7 @@ ASGARD_OMP_PARFOR_SIMD
 
   for (int i = 0; i < max_iter_; i++) {
     ++num_appy;
-    apply_lhs(1, p, 0, v); // v = A * p
+    apply_lhs(1, p.data(), 0, v.data()); // v = A * p
 
     P const alpha = rho / dot(rref, v);
 
@@ -705,7 +705,7 @@ ASGARD_OMP_PARFOR_SIMD
     }
 
     ++num_appy;
-    apply_lhs(1, r, 0, t); // t = A * p
+    apply_lhs(1, r.data(), 0, t.data()); // t = A * p
 
     P const omega = dot(r, t) / dot1(t);
 
@@ -732,8 +732,8 @@ ASGARD_OMP_PARFOR_SIMD
 
 template<typename P>
 int gmres<P>::solve(
-    operatoin_apply_precon_arr<P> apply_precon,
-    operatoin_apply_lhs_arr<P> apply_lhs, std::vector<P> const &rhs,
+    operatoin_apply_precon<P> apply_precon,
+    operatoin_apply_lhs<P> apply_lhs, std::vector<P> const &rhs,
     std::vector<P> &x) const
 {
   int const n = static_cast<int>(rhs.size());
@@ -936,6 +936,13 @@ ASGARD_OMP_PARFOR_SIMD
   }
 }
 
+template<typename P>
+void solver_manager<P>::xpby(std::vector<P> const &x, P beta, P y[]) {
+ASGARD_OMP_PARFOR_SIMD
+  for (size_t i = 0; i < x.size(); i++)
+    y[i] = x[i] + beta * y[i];
+}
+
 template<typename P>
 void solver_manager<P>::print_opts(std::ostream &os) const
 {
@@ -946,18 +953,18 @@ void solver_manager<P>::print_opts(std::ostream &os) const
       os << "  direct\n";
       break;
     case 1:
-      os << "  bicgstab\n";
-      os << "  tolerance:      " << std::get<solvers::bicgstab<P>>(var).tolerance() << '\n';
-      os << "  max iterations: " << std::get<solvers::bicgstab<P>>(var).max_iter() << '\n';
-      has_precon = true;
-      break;
-    case 2:
       os << "  gmres\n";
       os << "  tolerance: " << std::get<solvers::gmres<P>>(var).tolerance() << '\n';
       os << "  max inner: " << std::get<solvers::gmres<P>>(var).max_inner() << '\n';
       os << "  max outer: " << std::get<solvers::gmres<P>>(var).max_outer() << '\n';
       has_precon = true;
       break;
+    case 2:
+      os << "  bicgstab\n";
+      os << "  tolerance:      " << std::get<solvers::bicgstab<P>>(var).tolerance() << '\n';
+      os << "  max iterations: " << std::get<solvers::bicgstab<P>>(var).max_iter() << '\n';
+      has_precon = true;
+      break;
     default:
       break;
   }
diff --git a/src/asgard_solver.hpp b/src/asgard_solver.hpp
index 19b768ba1..649987eb9 100644
--- a/src/asgard_solver.hpp
+++ b/src/asgard_solver.hpp
@@ -115,21 +115,6 @@ class poisson_data
   std::vector<P> diag, subdiag, rhs;
 };
 
-/*!
- * \internal
- * \brief Stores the data for a diagonal Jacobi preconditioner
- *
- * \endinternal
- */
-template<typename P>
-struct prec_diagonal_jacobi
-{
-  //! make a default, no-preconditioner
-  prec_diagonal_jacobi() = default;
-  //! holds the inverse of the diagonal entries
-  std::vector<P> prec;
-};
-
 /*!
  * \internal
  * \brief Direct solver, explicitly forms the dense matrix, very expensive
@@ -164,18 +149,6 @@ class direct
   dense_matrix<P> mat;
 };
 
-/*!
- * \internal
- * \brief Signature for the left-hand linear operation for a solver
- *
- * Computes `y = alpha * A * x + beta * y`
- *
- * \endinternal
- */
-template<typename P>
-using operatoin_apply_lhs =
-  std::function<void(P alpha, std::vector<P> const &x, P beta, std::vector<P> &y)>;
-
 /*!
  * \internal
  * \brief Signature for the left-hand linear operation for a solver, raw-array variant
@@ -185,21 +158,9 @@ using operatoin_apply_lhs =
  * \endinternal
  */
 template<typename P>
-using operatoin_apply_lhs_arr =
+using operatoin_apply_lhs =
   std::function<void(P alpha, P const x[], P beta, P y[])>;
 
-/*!
- * \internal
- * \brief Signature for the preconditioner
- *
- * Computes `y = inverse-P * x` and do not need the constants
- *
- * \endinternal
- */
-template<typename P>
-using operatoin_apply_precon =
-  std::function<void(std::vector<P> const &x, std::vector<P> &y)>;
-
 /*!
  * \internal
  * \brief Signature for the preconditioner
@@ -210,7 +171,7 @@ using operatoin_apply_precon =
  * \endinternal
  */
 template<typename P>
-using operatoin_apply_precon_arr = std::function<void(P y[])>;
+using operatoin_apply_precon = std::function<void(P y[])>;
 
 /*!
  * \internal
@@ -281,8 +242,8 @@ class gmres
   }
 
   //! solve for the given linear operators, right-hand-side and initial iterate
-  int solve(operatoin_apply_precon_arr<P> apply_precon,
-            operatoin_apply_lhs_arr<P> apply_lhs, std::vector<P> const &rhs,
+  int solve(operatoin_apply_precon<P> apply_precon,
+            operatoin_apply_lhs<P> apply_lhs, std::vector<P> const &rhs,
             std::vector<P> &x) const;
 
   //! returns the set tolerance
@@ -327,8 +288,12 @@ struct solver_manager
   solver_manager() = default;
   //! create a new solver
   solver_manager(prog_opts const &options)
-    : opt(options.solver.value())
   {
+    rassert(options.solver, "implicit time-stepping requires a solver, e.g., "
+            "see --help for list of available solvers");
+
+    opt = options.solver.value();
+
     switch (opt) {
       case solve_opts::direct:
         var = solvers::direct<P>(); // will be initialized later
@@ -347,11 +312,11 @@ struct solver_manager
                 "missing tolerance for the iterative solver gmres");
         rassert(options.isolver_iterations,
                 "missing number of iterations for the iterative solver gmres");
-        rassert(options.isolver_outer_iterations,
+        rassert(options.isolver_inner_iterations,
                 "missing number of outer iterations for the iterative solver gmres");
         var = solvers::gmres<P>(options.isolver_tolerance.value(),
-                                options.isolver_iterations.value(),
-                                options.isolver_outer_iterations.value());
+                                options.isolver_inner_iterations.value(),
+                                options.isolver_iterations.value());
         precon = options.precon.value_or(preconditioner_opts::none);
         break;
       default: // unreachable
@@ -365,7 +330,7 @@ struct solver_manager
     std::get<solvers::direct<P>>(var)(x);
   }
 
-  //! iterate_solve without a preconditioner
+  //! iterative solver, calls the appropriate iterative solver
   void iterate_solve(solvers::operatoin_apply_lhs<P> apply_lhs,
                      std::vector<P> const &rhs, std::vector<P> &x) const
   {
@@ -377,55 +342,39 @@ struct solver_manager
                      solvers::operatoin_apply_lhs<P> apply_lhs,
                      std::vector<P> const &rhs, std::vector<P> &x) const
   {
-    expect(opt == solve_opts::bicgstab);
-    if (precon) {
-      // in the bicgstab, explicitly apply the preconditioner
-      solvers::bicgstab<P> const &bicg = std::get<solvers::bicgstab<P>>(var);
-      bicg.prec_rhs.resize(rhs.size());
-      bicg.prec_y.resize(rhs.size());
-      bicg.prec_yb.resize(rhs.size());
-
-      num_apply += 1;
-      precon(rhs, bicg.prec_rhs);
-      num_apply += bicg.solve(
-        [&](P alpha, std::vector<P> const &x, P beta, std::vector<P> &y) -> void
-        {
-          if (beta != 0)
-            bicg.prec_yb = y;
-
-          apply_lhs(alpha, x, 0, bicg.prec_y);
-          precon(bicg.prec_y, y);
-
-          if (beta != 0)
-            for (size_t i = 0; i < y.size(); i++)
-              y[i] += beta * bicg.prec_yb[i];
-        },
-        bicg.prec_rhs, x);
-    } else {
-      num_apply += std::get<solvers::bicgstab<P>>(var).solve(apply_lhs, rhs, x);
-    }
-  }
-
-  //! iterative solver, calls the appropriate iterative solver
-  void iterate_solve(solvers::operatoin_apply_lhs_arr<P> apply_lhs,
-                     std::vector<P> const &rhs, std::vector<P> &x) const
-  {
-    iterate_solve(nullptr, apply_lhs, rhs, x);
-  }
-
-  //! iterative solver, calls the appropriate iterative solver
-  void iterate_solve(solvers::operatoin_apply_precon_arr<P> precon,
-                     solvers::operatoin_apply_lhs_arr<P> apply_lhs,
-                     std::vector<P> const &rhs, std::vector<P> &x) const
-  {
-    expect(opt == solve_opts::gmres);
-    if (precon) {
-      solvers::gmres<P> const &gmres = std::get<solvers::gmres<P>>(var);
-
-      num_apply += gmres.solve(precon, apply_lhs, rhs, x);
-    } else {
-      num_apply += std::get<solvers::gmres<P>>(var).solve(
-        [](P *)->void{ /* no preconditioner */ }, apply_lhs, rhs, x);
+    expect(opt != solve_opts::direct);
+    if (opt == solve_opts::bicgstab) {
+      if (precon) {
+        solvers::bicgstab<P> const &bicg = std::get<solvers::bicgstab<P>>(var);
+
+        bicg.prec_y.resize(rhs.size());
+
+        bicg.prec_rhs = rhs;
+        precon(bicg.prec_rhs.data());
+
+        num_apply += bicg.solve([&](P alpha, P const xx[], P beta, P y[])
+            -> void {
+              if (beta == 0) {
+                apply_lhs(alpha, xx, 0, y);
+                precon(y);
+              } else {
+                apply_lhs(alpha, xx, 0, bicg.prec_y.data());
+                precon(bicg.prec_y.data());
+                xpby(bicg.prec_y, beta, y);
+              }
+            }, bicg.prec_rhs, x);
+      } else {
+        num_apply += std::get<solvers::bicgstab<P>>(var).solve(apply_lhs, rhs, x);
+      }
+    } else { // if (opt == solve_opts::gmres)
+      if (precon) {
+        solvers::gmres<P> const &gmres = std::get<solvers::gmres<P>>(var);
+
+        num_apply += gmres.solve(precon, apply_lhs, rhs, x);
+      } else {
+        num_apply += std::get<solvers::gmres<P>>(var).solve(
+          [](P *)->void{ /* no preconditioner */ }, apply_lhs, rhs, x);
+      }
     }
   }
 
@@ -446,9 +395,12 @@ struct solver_manager
   //! remembers the generation of the grid that was used to last set the manager
   int grid_gen = -1;
   //! holds the actual solver instance
-  std::variant<solvers::direct<P>, solvers::bicgstab<P>, solvers::gmres<P>> var;
+  std::variant<solvers::direct<P>, solvers::gmres<P>, solvers::bicgstab<P>> var;
   //! holds data for the jacobi preconditioner
   std::vector<P> jacobi;
+
+  //! helper method, y = x + beta * y, compiles with OpenMP and SIMD
+  static void xpby(std::vector<P> const &x, P beta, P y[]);
 };
 
 /*!
diff --git a/src/asgard_term_manager.cpp b/src/asgard_term_manager.cpp
index bb057ed37..4a58f57b6 100644
--- a/src/asgard_term_manager.cpp
+++ b/src/asgard_term_manager.cpp
@@ -123,7 +123,6 @@ void term_manager<P>::rebuld_term(
 
     // build the ADI preconditioner here
     if (precon == preconditioner_opts::adi) {
-      std::cout << " rebuilding ad\n";
       if (is_diag) {
         to_euler(legendre.pdof, alpha, wraw_diag);
         psedoinvert(legendre.pdof, wraw_diag, raw_diag0);
@@ -369,29 +368,29 @@ void term_manager<P>::apply_all(
 template<typename P>
 void term_manager<P>::apply_all_adi(
     sparse_grid const &grid, connection_patterns const &conns,
-    std::vector<P> const &x, std::vector<P> &y) const
+    P const x[], P y[]) const
 {
-  expect(x.size() == y.size());
-  expect(x.size() == kwork.w1.size());
+  int64_t const n = grid.num_indexes() * fm::ipow(legendre.pdof, grid.num_dims());
+
+  static std::vector<P> t1, t2; // more additional workspaces
 
-  std::vector<P> t1 = x;
-  std::vector<P> t2 = x;
+  t1.resize(n);
+  t2.resize(n);
+  std::copy_n(x, n, t1.data());
 
   auto it = terms.begin();
   while (it < terms.end())
   {
     if (it->num_chain == 1) {
-      kron_term_adi(grid, conns, *it, 1, t1, 0, t2);
+      kron_term_adi(grid, conns, *it, 1, t1.data(), 0, t2.data());
       std::swap(t1, t2);
       ++it;
     } else {
       // TODO: consider whether we should do this or not
       it += it->num_chain;
     }
-
-    // b = 1; // next iteration appends on y
   }
-  y = t1;
+  std::copy_n(t1.data(), n, y);
 }
 
 template<typename P>
@@ -480,7 +479,6 @@ void term_manager<P>::kron_diag(
   }
 }
 
-
 #ifdef ASGARD_ENABLE_DOUBLE
 template struct term_entry<double>;
 template struct term_manager<double>;
diff --git a/src/asgard_term_manager.hpp b/src/asgard_term_manager.hpp
index 146b29e12..c9c56a527 100644
--- a/src/asgard_term_manager.hpp
+++ b/src/asgard_term_manager.hpp
@@ -92,7 +92,7 @@ struct term_manager
 
   //! y = prod(terms_adi * x), applies the ADI preconditioning to all terms
   void apply_all_adi(sparse_grid const &grid, connection_patterns const &conns,
-                     std::vector<P> const &x, std::vector<P> &y) const;
+                     P const x[], P y[]) const;
 
   //! construct term diagonal
   void make_jacobi(sparse_grid const &grid, connection_patterns const &conns,
@@ -114,11 +114,10 @@ struct term_manager
               alpha, x, beta, y, kwork);
   }
   void kron_term_adi(sparse_grid const &grid, connection_patterns const &conns,
-                     term_entry<P> const &tme, P alpha, std::vector<P> const &x, P beta,
-                     std::vector<P> &y) const
+                     term_entry<P> const &tme, P alpha, P const x[], P beta,
+                     P y[]) const
   {
-    block_cpu(legendre.pdof, grid, conns, tme.perm, tme.adi,
-              alpha, x.data(), beta, y.data(), kwork);
+    block_cpu(legendre.pdof, grid, conns, tme.perm, tme.adi, alpha, x, beta, y, kwork);
   }
 
   template<data_mode mode>
diff --git a/src/asgard_time_advance.cpp b/src/asgard_time_advance.cpp
index 318225ca9..032f76e7f 100644
--- a/src/asgard_time_advance.cpp
+++ b/src/asgard_time_advance.cpp
@@ -171,8 +171,8 @@ imex_advance(discretization_manager<P> &disc,
   // Implicit step f_1: f_1 - dt B f_1 = f_1s
   solve_opts solver  = options.solver.value();
   P const tolerance  = *options.isolver_tolerance;
-  int const restart  = *options.isolver_iterations;
-  int const max_iter = *options.isolver_outer_iterations;
+  int const restart  = *options.isolver_inner_iterations;
+  int const max_iter = *options.isolver_iterations;
   fk::vector<P, mem_type::owner, imex_resrc> f_1(f.size());
   fk::vector<P, mem_type::owner, imex_resrc> f_1_output(f.size());
   if (pde.do_collision_operator())
@@ -561,66 +561,25 @@ void crank_nicolson<P>::next_step(
   P const time = disc.time_params().time();
   P const dt   = disc.time_params().dt();
 
+  // if the grid changed since the last time we used the solver
+  // update the matrices and preconditioners, update-grid checks what's needed
   if (solver.grid_gen != disc.get_sgrid().generation())
     solver.update_grid(disc.get_sgrid(), disc.get_conn(), disc.get_terms(), 0.5 * dt);
 
-  next = current; // copy
+  if (solver.opt == solve_opts::direct) {
+    next = current; // copy
 
-  disc.terms_apply_all(-0.5 * dt, current, 1, next);
-  disc.add_ode_rhs_sources(time + 0.5 * dt, dt, next);
+    disc.terms_apply_all(-0.5 * dt, current, 1, next);
+    disc.add_ode_rhs_sources(time + 0.5 * dt, dt, next);
 
-  if (solver.opt == solve_opts::direct) {
     solver.direct_solve(next);
-  } else if (solver.opt == solve_opts::bicgstab) {
-    work = next;
+  } else { // iterative solver
+    // form the right-hand-side inside work
+    work = current;
+    disc.terms_apply_all(-0.5 * dt, current, 1, work);
+    disc.add_ode_rhs_sources(time + 0.5 * dt, dt, work);
 
-    int64_t const n = static_cast<int64_t>(work.size());
-
-    switch (solver.precon) {
-    case preconditioner_opts::none:
-      solver.iterate_solve(
-        [&](P alpha, std::vector<P> const &x, P beta, std::vector<P> &y) -> void
-        {
-          ASGARD_OMP_PARFOR_SIMD
-          for (int64_t i = 0; i < n; i++)
-            y[i] = alpha * x[i] + beta * y[i];
-          disc.terms_apply_all(0.5 * alpha * dt, x, 1, y);
-        }, work, next);
-    break;
-    case preconditioner_opts::jacobi:
-      solver.iterate_solve(
-        [&](std::vector<P> const &x, std::vector<P> &y) -> void
-        {
-          ASGARD_OMP_PARFOR_SIMD
-          for (int64_t i = 0; i < n; i++)
-            y[i] = x[i] * solver.jacobi[i];
-        },
-        [&](P alpha, std::vector<P> const &x, P beta, std::vector<P> &y) -> void
-        {
-          ASGARD_OMP_PARFOR_SIMD
-          for (int64_t i = 0; i < n; i++)
-            y[i] = alpha * x[i] + beta * y[i];
-          disc.terms_apply_all(0.5 * alpha * dt, x, 1, y);
-        }, work, next);
-    break;
-    default:
-      // assuming ADI
-      solver.iterate_solve(
-        [&](std::vector<P> const &x, std::vector<P> &y) -> void
-        {
-          disc.terms_apply_adi(x, y);
-        },
-        [&](P alpha, std::vector<P> const &x, P beta, std::vector<P> &y) -> void
-        {
-          ASGARD_OMP_PARFOR_SIMD
-          for (int64_t i = 0; i < n; i++)
-            y[i] = alpha * x[i] + beta * y[i];
-          disc.terms_apply_all(0.5 * alpha * dt, x, 1, y);
-        }, work, next);
-    break;
-    }
-  } else if (solver.opt == solve_opts::gmres) {
-    work = next;
+    next = current; // use the current step as the initial guess
 
     int64_t const n = static_cast<int64_t>(work.size());
 
@@ -639,6 +598,7 @@ void crank_nicolson<P>::next_step(
       solver.iterate_solve(
         [&](P y[]) -> void
         {
+          tools::time_event timing_("jacobi preconditioner");
           ASGARD_OMP_PARFOR_SIMD
           for (int64_t i = 0; i < n; i++)
             y[i] *= solver.jacobi[i];
@@ -651,13 +611,15 @@ void crank_nicolson<P>::next_step(
           disc.terms_apply_all(0.5 * alpha * dt, x, 1, y);
         }, work, next);
     break;
-    default:
+    default: {
+      static std::vector<P> adi_work;
+      adi_work.resize(work.size());
       // assuming ADI
       solver.iterate_solve(
-        [&](P []) -> void
+        [&](P y[]) -> void
         {
-          // disc.terms_apply_adi(x, y);
-          throw std::runtime_error("ADI not implemented for GMRES");
+          disc.terms_apply_adi(y, adi_work.data());
+          std::copy(adi_work.begin(), adi_work.end(), y);
         },
         [&](P alpha, P const x[], P beta, P y[]) -> void
         {
@@ -666,6 +628,7 @@ void crank_nicolson<P>::next_step(
             y[i] = alpha * x[i] + beta * y[i];
           disc.terms_apply_all(0.5 * alpha * dt, x, 1, y);
         }, work, next);
+    }
     break;
     }
   }
diff --git a/src/diffusion.cpp b/src/diffusion.cpp
index 939f797be..a6da20cc4 100644
--- a/src/diffusion.cpp
+++ b/src/diffusion.cpp
@@ -81,10 +81,39 @@ asgard::PDEv2<P> make_diffusion_pde(int num_dims, asgard::prog_opts options) {
 
   options.default_stop_time = 3.0; // integrate until T = 3
 
-  // using implicit Crank-Nicolson method
+  // using implicit Crank-Nicolson method, which requires a solver
   options.default_step_method = asgard::time_advance::method::cn;
 
-  options.default_solver = asgard::solve_opts::direct; // bad but OK for this example
+  if (options.max_level() < 5) {
+    // direct (dense) solver is fast for small problems and works well for prototyping
+    // and debugging, since it remove from the problem some additional factors,
+    // such as solver tolerance and number of iterations
+    options.default_solver = asgard::solve_opts::direct;
+  } else {
+    // when the problem size becomes significant, forming and factorizing the dense
+    // operator matrix becomes prohibitively expensive in flops and memory usage
+    // iterative solvers are needed and it is good to specify default parameters
+    options.default_solver = asgard::solve_opts::gmres;
+  }
+
+  // only the iterative solvers will use these values
+  // jacobi is the fastest and (currently) most stable preconditioner
+  options.default_precon = asgard::preconditioner_opts::jacobi;
+
+  // the tolerance for the iterative solver should probably be updated
+  // based on the time-step and the max-level
+  // this is a tight number of remove the solver error from consideration
+  options.default_isolver_tolerance = 1.E-7;
+
+  // the number of iterations should depends on the time-step and condition
+  // number of the operators, should be kept high to allow for convergence
+  options.isolver_iterations = 1000;
+
+  // GMRES uses a two-loop approach (restarted GMRES)
+  // the inner iterations explicitly form and manipulate the basis for the Krylov sub-space
+  // this requires lots of memory and the number here should be kept moderate
+  // (memory usage is dominated by isolver_inner_iterations * degrees-of-freedom)
+  options.isolver_inner_iterations = 50;
 
   // create a pde from the given options and domain
   // we can read the variables using pde.options() and pde.domain() (both return const-refs)
@@ -255,7 +284,7 @@ double get_error_l2(asgard::discretization_manager<P> const &disc) {
   // when cos(t) vanishes, so does the exact solution and enorm -> 0
   // for small values of enorm, the relative error is artificially magnified
   // switch between relative and absolute error
-  if (enorm < 1.E-3)
+  if (enorm < 1)
     return std::sqrt(ndiff + enorm - nself);
   else
     return std::sqrt((ndiff + enorm - nself) / enorm);
@@ -404,33 +433,33 @@ void self_test() {
 
 #ifdef ASGARD_ENABLE_DOUBLE
   // check convergence w.r.t. level
-  dotest(1.E-3, 1, "-l 4 -n 20");
-  dotest(1.E-4, 1, "-l 5 -n 20");
-  dotest(5.E-5, 1, "-l 6 -n 20");
+  dotest(1.E-3, 1, "-l 4 -n 20 -sv direct");
+  dotest(1.E-4, 1, "-l 5 -n 20 -sv direct");
+  dotest(5.E-5, 1, "-l 6 -n 20 -sv direct");
 
-  dotest<double, false>(1.E-3, 1, "-l 4 -n 20"); // check chaining
-  dotest<double, false>(1.E-4, 1, "-l 5 -n 20");
-  dotest<double, false>(5.E-5, 1, "-l 6 -n 20");
+  dotest<double, false>(1.E-3, 1, "-l 4 -n 20 -sv direct"); // check chaining
+  dotest<double, false>(1.E-4, 1, "-l 5 -n 20 -sv direct");
+  dotest<double, false>(5.E-5, 1, "-l 6 -n 20 -sv direct");
 
-  dotest(1.E-1, 1, "-l 5 -d 0 -n 20");
-  dotest(5.E-3, 1, "-l 5 -d 1 -n 20");
-  dotest(5.E-5, 1, "-l 5 -d 2 -n 40 -dt 0.005"); // time error manifests here
+  dotest(1.E-1, 1, "-l 5 -d 0 -n 20 -sv direct");
+  dotest(5.E-3, 1, "-l 5 -d 1 -n 20 -sv direct");
+  dotest(5.E-5, 1, "-l 5 -d 2 -n 40 -dt 0.005 -sv direct"); // time error manifests here
 
-  dotest(1.00E-2, 2, "-l 6 -t 0.5 -dt 0.1"); // second order in time
-  dotest(2.50E-3, 2, "-l 6 -t 0.5 -dt 0.05");
-  dotest(6.25E-4, 2, "-l 6 -t 0.5 -dt 0.025");
+  dotest(1.00E-2, 2, "-l 6 -t 0.5 -dt 0.1 -sv direct"); // second order in time
+  dotest(2.50E-3, 2, "-l 6 -t 0.5 -dt 0.05 -sv direct");
+  dotest(6.25E-4, 2, "-l 6 -t 0.5 -dt 0.025 -sv direct");
 
-  dotest(1.E-2, 3, "-l 4 -t 0.5 -dt 0.1"); // direct solver multi-d
+  dotest(1.E-2, 3, "-l 4 -t 0.5 -dt 0.1 -sv direct"); // direct solver multi-d
 
   // in the first few steps here, the grid is very coarse
   // finer refinement and time-step is needed, only the final error is OK
-  longtest(2.E-3, 2, "-l 2 -m 8 -dt 0.01 -a 1.E-3"); // adapt
-  longtest(2.E-4, 2, "-l 2 -m 8 -dt 0.01 -a 1.E-4");
+  longtest(2.E-3, 2, "-l 2 -m 8 -a 1.E-3"); // adapt
+  dotest(5.E-3, 2, "-l 8 -a 1.E-4");
 #endif
 
 #ifdef ASGARD_ENABLE_FLOAT
-  dotest<float>(1.E-2, 1, "-l 4 -n 10");
-  dotest<float>(1.E-2, 2, "-l 5 -t 0.5 -dt 0.1");
+  dotest<float>(1.E-2, 1, "-l 4 -n 10 -sv direct");
+  dotest<float>(1.E-2, 2, "-l 5 -t 0.5 -dt 0.1  -sv direct");
 #endif
 }
 
diff --git a/src/pde/asgard_pde_base.hpp b/src/pde/asgard_pde_base.hpp
index ccd3d8696..55e23cb5d 100644
--- a/src/pde/asgard_pde_base.hpp
+++ b/src/pde/asgard_pde_base.hpp
@@ -876,8 +876,8 @@ class PDE
       options_.isolver_tolerance = solvers::notolerance;
     if (not options_.isolver_iterations)
       options_.isolver_iterations = solvers::novalue;
-    if (not options_.isolver_outer_iterations)
-      options_.isolver_outer_iterations = solvers::novalue;
+    if (not options_.isolver_inner_iterations)
+      options_.isolver_inner_iterations = solvers::novalue;
   }
 
   constexpr static int extract_dim0 = 1;
@@ -2115,6 +2115,13 @@ class PDEv2
       // setting up preconditioner for a possibly iterative solver
       if (not options_.precon and options_.default_precon)
         options_.precon = options_.default_precon.value();
+      // setting up the solver options
+      if (not options_.isolver_tolerance and options_.default_isolver_tolerance)
+        options_.isolver_tolerance = options_.default_isolver_tolerance.value();
+      if (not options_.isolver_iterations and options_.default_isolver_iterations)
+        options_.isolver_iterations = options_.default_isolver_iterations.value();
+      if (not options_.isolver_inner_iterations and options_.default_isolver_inner_iterations)
+        options_.isolver_inner_iterations = options_.default_isolver_inner_iterations.value();
     }
 
     // don't support l-inf norm yet