
Commit

Add parallel GMRES implementation using existing distribution of matrix abstraction.

Only tested on continuity_1, on CPU, with 4 processes. Need to try more PDEs with
different numbers of processes.

Haven't tried with GPUs. I think this will be easiest with CUDA-aware MPI.

Kept existing serial implementation to avoid additional copies. We may want to
revisit this soon.

Signed-off-by: Steven Hahn <hahnse@ornl.gov>
quantumsteve committed Jan 25, 2024
1 parent 39f1cf8 commit 8f11cea
Showing 10 changed files with 378 additions and 21 deletions.
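
The core primitive this commit adds is a scalar overload of reduce_results (see src/distribution.cpp below) that allreduces a single value across the ranks sharing a subgrid row. A minimal sketch of how a distributed inner product for GMRES could be built on top of it; the distributed_dot helper, the include path, and the enclosing namespace are assumptions for illustration, not part of the commit:

// Sketch only, not part of the commit: a row-distributed inner product that
// could serve GMRES orthogonalization. The include path, namespace, and
// fk::vector accessors below are assumed from their use elsewhere in asgard.
#include "distribution.hpp"

namespace asgard
{
template<typename P>
P distributed_dot(fk::vector<P> const &x, fk::vector<P> const &y,
                  distribution_plan const &plan, int const my_rank)
{
  // each rank forms the dot product over its locally owned segment
  P local = 0;
  P const *const xd = x.data();
  P const *const yd = y.data();
  for (int i = 0; i < x.size(); ++i)
    local += xd[i] * yd[i];

  // the new scalar overload sums the partial values across the subgrid row
  P global = 0;
  reduce_results(local, global, plan, my_rank);
  return global;
}
} // namespace asgard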
4 changes: 2 additions & 2 deletions src/asgard_resources.hpp
@@ -14,7 +14,7 @@
namespace asgard
{
// used to suppress warnings in unused variables
auto const ignore = [](auto ignored) { (void)ignored; };
auto const ignore = [](auto const &ignored) { (void)ignored; };

/*!
* \brief Default precision to use, double if enabled and float otherwise.
@@ -115,4 +115,4 @@ void memcpy_2d(P *dest, int const dest_stride, P const *const source,
#include "asgard_resources_cuda.tpp"
#else
#include "asgard_resources_host.tpp"
#endif
#endif
71 changes: 70 additions & 1 deletion src/distribution.cpp
@@ -315,7 +315,7 @@ void reduce_results(fk::vector<P, src_mem, resrc> const &source,
MPI_Comm_split(global_communicator, my_row, my_col, &row_communicator);
expect(success == 0);

MPI_Datatype const mpi_type =
MPI_Datatype constexpr mpi_type =
std::is_same_v<P, double> ? MPI_DOUBLE : MPI_FLOAT;
success = MPI_Allreduce((void *)source.data(), (void *)dest.data(),
source.size(), mpi_type, MPI_SUM, row_communicator);
@@ -330,6 +330,49 @@ void reduce_results(fk::vector<P, src_mem, resrc> const &source,
#endif
}

template<typename P, std::enable_if_t<std::is_floating_point<P>::value, bool> = true>
void reduce_results(P const source,
P &dest,
distribution_plan const &plan, int const my_rank)
{
expect(my_rank >= 0);
expect(my_rank < static_cast<int>(plan.size()));

#ifdef ASGARD_USE_MPI
if (plan.size() == 1)
{
dest = source;
return;
}

dest = 0.;
int const num_cols = get_num_subgrid_cols(plan.size());

int const my_row = my_rank / num_cols;
int const my_col = my_rank % num_cols;

MPI_Comm row_communicator;
MPI_Comm const global_communicator = distro_handle.get_global_comm();

auto success =
MPI_Comm_split(global_communicator, my_row, my_col, &row_communicator);
expect(success == 0);

MPI_Datatype constexpr mpi_type =
std::is_same_v<P, double> ? MPI_DOUBLE : MPI_FLOAT;
success = MPI_Allreduce((void *)&source, (void *)&dest,
1, mpi_type, MPI_SUM, row_communicator);
expect(success == 0);

success = MPI_Comm_free(&row_communicator);
expect(success == 0);

#else
dest = source;
return;
#endif
}

//
// -- below functionality for exchanging solution vector data across subgrid
// rows via point-to-point messages.
@@ -1170,16 +1213,34 @@ void scatter_matrix(P const *A, int const *descA, P *A_distr,

#ifdef ASGARD_ENABLE_DOUBLE

template void reduce_results(double const source, double &dest,
distribution_plan const &plan, int const my_rank);

template void reduce_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::owner, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void reduce_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::view, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::owner, resource::host> &dest,
int const segment_size, distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<double, mem_type::view, resource::host> const &source,
fk::vector<double, mem_type::owner, resource::host> &dest,
int const segment_size, distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<double, mem_type::owner, resource::host> const &source,
fk::vector<double, mem_type::view, resource::host> &dest,
int const segment_size, distribution_plan const &plan, int const my_rank);

#ifdef ASGARD_USE_CUDA
template void reduce_results(
fk::vector<double, mem_type::owner, resource::device> const &source,
@@ -1220,11 +1281,19 @@ template void scatter_matrix<double>(double const *A, int const *descA,

#ifdef ASGARD_ENABLE_FLOAT

template void reduce_results(float const source, float &dest,
distribution_plan const &plan, int const my_rank);

template void
reduce_results(fk::vector<float, mem_type::owner, resource::host> const &source,
fk::vector<float, mem_type::owner, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void
reduce_results(fk::vector<float, mem_type::owner, resource::host> const &source,
fk::vector<float, mem_type::view, resource::host> &dest,
distribution_plan const &plan, int const my_rank);

template void exchange_results(
fk::vector<float, mem_type::owner, resource::host> const &source,
fk::vector<float, mem_type::owner, resource::host> &dest,
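
The scalar reduce_results added above splits the global communicator by subgrid row and allreduces one value across that row. The same MPI pattern, shown standalone and independent of asgard's types (the two-column subgrid layout is an assumption for illustration; run with, e.g., 4 ranks):

// Standalone sketch of the comm-split + allreduce pattern, not part of the
// commit. Compile with an MPI wrapper (e.g. mpicxx) and run under mpirun.
#include <cstdio>
#include <mpi.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int my_rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  // assume two subgrid columns, so consecutive pairs of ranks share a row
  int const num_cols = 2;
  int const my_row = my_rank / num_cols;
  int const my_col = my_rank % num_cols;

  // group the ranks of one subgrid row into their own communicator
  MPI_Comm row_comm;
  MPI_Comm_split(MPI_COMM_WORLD, my_row, my_col, &row_comm);

  // every rank contributes one scalar; the row communicator sums them
  double const local = static_cast<double>(my_rank);
  double summed = 0.0;
  MPI_Allreduce(&local, &summed, 1, MPI_DOUBLE, MPI_SUM, row_comm);

  std::printf("rank %d (row %d): row sum = %f\n", my_rank, my_row, summed);

  MPI_Comm_free(&row_comm);
  MPI_Finalize();
  return 0;
}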
4 changes: 4 additions & 0 deletions src/distribution.hpp
@@ -238,6 +238,10 @@ template<typename P, mem_type src_mem, mem_type dst_mem, resource resrc>
void reduce_results(fk::vector<P, src_mem, resrc> const &source,
fk::vector<P, dst_mem, resrc> &dest,
distribution_plan const &plan, int const my_rank);
template<typename P, std::enable_if_t<std::is_floating_point<P>::value, bool> = true>
void reduce_results(P const source,
P &dest,
distribution_plan const &plan, int const my_rank);

// generate a message list for each rank for exchange_results function;
// conceptually an internal component function, exposed for testing
78 changes: 78 additions & 0 deletions src/distribution_tests.cpp
@@ -453,6 +453,84 @@ TEMPLATE_TEST_CASE("allreduce across row of subgrids", "[distribution]",
}
}

TEMPLATE_TEST_CASE("allreduce element across row", "[distribution]",
test_precs)
{
if (!is_active())
{
return;
}

SECTION("1 rank")
{
auto const num_ranks = 1;
auto const my_rank = 0;
int const degree = 4;
int const level = 2;

options const o = make_options(
{"-l", std::to_string(level), "-d", std::to_string(degree)});
auto const pde =
make_PDE<default_precision>(PDE_opts::continuity_2, level, degree);
elements::table const table(o, *pde);

auto const plan = get_plan(num_ranks, table);

TestType const gold{42};
TestType const x = gold;
TestType fx = 0.;
reduce_results(x, fx, plan, my_rank);
REQUIRE(fx == gold);
}

SECTION("multiple ranks")
{
#ifdef ASGARD_USE_MPI

int const my_rank = distrib_test_info.get_my_rank();
int const num_ranks = distrib_test_info.get_num_ranks();

if (my_rank < num_ranks)
{
int const degree = 5;
int const level = 4;

options const o = make_options(
{"-l", std::to_string(level), "-d", std::to_string(degree)});

auto const pde =
make_PDE<default_precision>(PDE_opts::continuity_3, level, degree);
elements::table const table(o, *pde);

auto const plan = get_plan(num_ranks, table);

std::vector<TestType> rank_outputs;
for (int i = 0; i < static_cast<int>(plan.size()); ++i)
{
rank_outputs.push_back(i);
}
int const my_row = my_rank / get_num_subgrid_cols(num_ranks);
TestType gold = 0;
for (int i = 0; i < static_cast<int>(rank_outputs.size()); ++i)
{
if (i / get_num_subgrid_cols(num_ranks) == my_row)
{
gold = gold + rank_outputs[i];
}
}

auto const &x = rank_outputs[std::min(my_rank, static_cast<int>(plan.size()) - 1)];
TestType fx = 0.;
reduce_results(x, fx, plan, my_rank);
REQUIRE_THAT(fx, Catch::Matchers::WithinAbs(gold, std::numeric_limits<TestType>::epsilon()));
}
#else
REQUIRE(true);
#endif
}
}

void generate_messages_test(int const num_ranks, elements::table const &table)
{
auto const plan = get_plan(num_ranks, table);
2 changes: 1 addition & 1 deletion src/program_options.cpp
@@ -308,7 +308,7 @@ parser::parser(int argc, char const *const *argv)
if ((use_implicit_stepping || use_imex_stepping) && get_num_ranks() > 1)
{
auto const choice = solver_mapping.at(solver_str);
if (choice != solve_opts::scalapack)
if (choice == solve_opts::direct)
{
std::cerr << "Distribution not implemented for implicit stepping\n";
valid = false;