Misc fixes (#754)

* some performance improvements
project-asgard · Dec 26, 2024 · 4e3f9ea · 4e3f9ea
1 parent ab6d6ab
commit 4e3f9ea
Show file tree

Hide file tree

Showing 14 changed files with 155 additions and 156 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -102,6 +102,14 @@ option (ASGARD_USE_TIMER "Enable the builtin profiling tool" ON)
 option (ASGARD_RECOMMENDED_DEFAULTS "Enable OpenMP, set some flags, download OpenBLAS if system BLAS is missing." OFF)
 option (ASGARD_BUILD_DOCS "(incomplete) Build the documentation." OFF)
 
+if (ASGARD_USE_TIMER)
+  if (ASGARD_USE_MPI)
+    option (ASGARD_USE_FLOPCOUNTER "Counts flops as part of the timing process" ON)
+  else()
+    option (ASGARD_USE_FLOPCOUNTER "Counts flops as part of the timing process" OFF)
+  endif()
+endif()
+
 if (NOT ASGARD_USE_MPI AND ASGARD_USE_CUDA)
   message(FATAL_ERROR "CUDA has been temporarily disabled for the non-mpi mode")
 endif()
@@ -636,6 +644,9 @@ if (ASGARD_BUILD_TESTS)
                   ASGARD_USE_TIMER)
     message(STATUS "  ${_opt}=${${_opt}}")
   endforeach()
+  if (ASGARD_USE_TIMER)
+    message(STATUS "  ASGARD_USE_FLOPCOUNTER=${ASGARD_USE_FLOPCOUNTER}")
+  endif()
   if (ASGARD_USE_CUDA)
     foreach(_opt CMAKE_CUDA_COMPILER CMAKE_CUDA_FLAGS ASGARD_USE_GPU_MEM_LIMIT)
       message(STATUS "  ${_opt}=${${_opt}}")

diff --git a/src/asgard_adapt.cpp b/src/asgard_adapt.cpp
@@ -108,6 +108,7 @@ template<typename P>
 fk::vector<P>
 distributed_grid<P>::coarsen_solution(PDE<P> &pde, fk::vector<P> const &x)
 {
+  auto session = tools::time_session("coarsen solution");
   auto const coarse_y = this->coarsen(x, pde.options());
   update_levels(this->get_table(), pde);
   return coarse_y;
@@ -117,6 +118,7 @@ template<typename P>
 fk::vector<P>
 distributed_grid<P>::refine_solution(PDE<P> &pde, fk::vector<P> const &x)
 {
+  auto session = tools::time_session("refine solution");
   auto const refine_y = this->refine(x, pde.options());
   update_levels(this->get_table(), pde);
   return refine_y;

diff --git a/src/asgard_block_matrix.hpp b/src/asgard_block_matrix.hpp
@@ -460,6 +460,10 @@ struct block_sparse_matrix
   P *operator[] (int64_t i) { return data_[i]; }
   //! returns the block at the given index
   P const *operator[] (int64_t i) const { return data_[i]; }
+  //! returns the internal data
+  P *data() { return data_[0]; }
+  //! returns the internal data (const overload)
+  P const *data() const { return data_[0]; }
 
   //! converts the matrix to a full one, mostly for testing/plotting
   block_matrix<P> to_full(connection_patterns const &conns) const

diff --git a/src/asgard_boundary_conditions.cpp b/src/asgard_boundary_conditions.cpp
@@ -111,6 +111,7 @@ std::array<unscaled_bc_parts<P>, 2> make_unscaled_bc_parts(
     connection_patterns const &conn,
     int const start_element, int const stop_element, P const t_init)
 {
+  tools::time_event timing("make unscaled bc");
   expect(start_element >= 0);
   expect(stop_element < table.size());
   expect(stop_element >= start_element);

diff --git a/src/asgard_build_info.hpp.in b/src/asgard_build_info.hpp.in
@@ -12,6 +12,7 @@
 
 #cmakedefine ASGARD_USE_HIGHFIVE
 #cmakedefine ASGARD_USE_TIMER
+#cmakedefine ASGARD_USE_FLOPCOUNTER
 #cmakedefine ASGARD_USE_CUDA
 #cmakedefine ASGARD_USE_GPU_MEM_LIMIT
 #cmakedefine ASGARD_USE_OPENMP

diff --git a/src/asgard_discretization.cpp b/src/asgard_discretization.cpp
@@ -88,6 +88,11 @@ discretization_manager<precision>::discretization_manager(
   fixed_bc = boundary_conditions::make_unscaled_bc_parts(
         *pde, grid.get_table(), transformer, hier, matrices, conn, msg.row_start, msg.row_stop);
 
+#ifdef KRON_MODE_GLOBAL
+  // the imex-flag is not used internally
+  kronops.make(imex_flag::unspecified, *pde, matrices, grid);
+#endif
+
   if (high_verbosity())
     node_out() << "  generating: moment vectors..." << '\n';
 

diff --git a/src/asgard_discretization.hpp b/src/asgard_discretization.hpp
@@ -253,13 +253,19 @@ class discretization_manager
   //! update components on grid reset
   void update_grid_components()
   {
+    tools::time_event performance("update grid components");
     kronops.clear();
     generate_coefficients(*pde, matrices, conn, hier, time_, coeff_update_mode::independent);
-#ifndef KRON_MODE_GLOBAL
+
+#ifdef KRON_MODE_GLOBAL
+    // the imex-flag is not used internally
+    kronops.make(imex_flag::unspecified, *pde, matrices, grid);
+#else
     pde->coeffs_.resize(pde->num_terms() * pde->num_dims());
     for (int64_t t : indexof(pde->coeffs_.size()))
       pde->coeffs_[t] = matrices.term_coeffs[t].to_fk_matrix(degree_ + 1, conn);
 #endif
+
     auto const my_subgrid = grid.get_subgrid(get_rank());
     fixed_bc = boundary_conditions::make_unscaled_bc_parts(
         *pde, grid.get_table(), transformer, hier, matrices,
@@ -273,7 +279,7 @@ class discretization_manager
   //! rebuild the moments
   void reset_moments()
   {
-    tools::time_event performance("update_system");
+    tools::time_event performance("reset moments");
 
     int const level      = pde->get_dimensions()[0].get_level();
     precision const min  = pde->get_dimensions()[0].domain_min;

diff --git a/src/asgard_kron_operators.hpp b/src/asgard_kron_operators.hpp
@@ -38,7 +38,8 @@ struct kron_operators
   }
 
   template<resource rec = resource::host>
-  void apply(imex_flag entry, precision alpha, precision const x[], precision beta, precision y[]) const
+  void apply(imex_flag entry, precision alpha, precision const x[],
+             precision beta, precision y[]) const
   {
     apply<rec>(entry, 0, alpha, x, beta, y);
   }
@@ -203,15 +204,19 @@ struct kron_operators
   {}
 
   template<resource rec = resource::host>
-  void apply(imex_flag entry, precision alpha, precision const x[], precision beta, precision y[]) const
+  void apply(imex_flag entry, precision alpha, precision const x[],
+             precision beta, precision y[]) const
   {
     apply<rec>(entry, precision{0}, alpha, x, beta, y);
   }
 
   //! \brief Apply the given matrix entry
   template<resource rec = resource::host>
-  void apply(imex_flag entry, precision time, precision alpha, precision const x[], precision beta, precision y[]) const
+  void apply(imex_flag entry, precision time, precision alpha, precision const x[],
+             precision beta, precision y[]) const
   {
+    auto const &terms = term_groups_[static_cast<int>(entry)];
+
     // prep stage for the operator application
     // apply the beta parameter, all operations are incremental
     if (beta == 0)
@@ -220,10 +225,9 @@ struct kron_operators
       lib_dispatch::scal<resource::host>(kglobal.num_active(), beta, y, 1);
 
     // if any work will be done, copy x into the padded workspace
-    if (kglobal.is_active(entry) or interp)
-      std::copy_n(x, kglobal.num_active(), workspace.x.begin());
+    std::copy_n(x, kglobal.num_active(), workspace.x.begin());
 
-    kglobal.template apply<rec>(entry, alpha, y);
+    kglobal.template apply<rec>(*tcoeffs, terms, alpha, y);
 
     if (interp)
     {
@@ -249,57 +253,68 @@ struct kron_operators
 
   int64_t flops(imex_flag entry) const
   {
-    return kglobal.flops(entry);
+    return kglobal.flops(entry, term_groups_);
   }
 
   //! \brief Make the matrix for the given entry
   void make(imex_flag entry, PDE<precision> const &pde,
             coefficient_matrices<precision> &cmats,
             adapt::distributed_grid<precision> const &grid)
   {
-    if (pde_ == nullptr and pde.has_interp())
+    tools::time_event timing("make kron-operators");
+    tcoeffs = &cmats.term_coeffs;
+    if (pde_ == nullptr)
     {
-      pde.get_domain_bounds(dmin, dslope);
-      domain_scale = precision{1};
-      for (int d = 0; d < pde.num_dims(); d++)
+      pde_   = &pde;
+      for (auto im : {imex_flag::unspecified, imex_flag::imex_explicit, imex_flag::imex_implicit})
+        term_groups_[static_cast<int>(im)] = get_used_terms(pde, im);
+
+      if (pde.has_interp())
       {
-        dslope[d] -= dmin[d];
-        domain_scale *= dslope[d];
-      }
-      domain_scale = precision{1} / std::sqrt(domain_scale);
+        pde.get_domain_bounds(dmin, dslope);
+        domain_scale = precision{1};
+        for (int d = 0; d < pde.num_dims(); d++)
+        {
+          dslope[d] -= dmin[d];
+          domain_scale *= dslope[d];
+        }
+        domain_scale = precision{1} / std::sqrt(domain_scale);
 
-      pde_   = &pde;
-      interp = interpolation(pde_->num_dims(), conn_->get(connect_1d::hierarchy::volume), &workspace);
+
+        interp = interpolation(pde_->num_dims(), conn_->get(connect_1d::hierarchy::volume), &workspace);
+      }
     }
     if (not kglobal)
     {
       kglobal = make_block_global_kron_matrix(
           pde, grid, conn_->get(connect_1d::hierarchy::volume),
           conn_->get(connect_1d::hierarchy::full), &workspace, verbosity);
-      set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal);
       if (interp)
       {
         finterp.resize(workspace.x.size());
         inodes.clear();
       }
     }
-    else if (not kglobal.specific_is_set(entry))
-      set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal);
+
+    // rebuild the preconditioner
+    if (entry == imex_flag::imex_implicit or pde.use_implicit())
+    {
+      int const imex_indx = static_cast<int>(entry);
+      build_preconditioner(pde, cmats, *conn_, grid,
+                           term_groups_[imex_indx], kglobal.pre_con_);
+    }
   }
 
   /*!
    * \brief Either makes the matrix or if it exists, just updates only the
    *        coefficients
+   *
+   * TODO: remove this method once the local-mode no longer needs this.
    */
-  void reset_coefficients(imex_flag entry, PDE<precision> const &pde,
-                          coefficient_matrices<precision> &cmats,
-                          adapt::distributed_grid<precision> const &grid)
-  {
-    if (not kglobal)
-      make(entry, pde, cmats, grid);
-    else
-      set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal);
-  }
+  void reset_coefficients(imex_flag, PDE<precision> const &,
+                          coefficient_matrices<precision> &,
+                          adapt::distributed_grid<precision> const &)
+  {}
 
   //! \brief Clear all matrices
   void clear()
@@ -383,6 +398,9 @@ struct kron_operators
   std::array<precision, max_num_dimensions> dmin, dslope;
   connection_patterns const *conn_ = nullptr;
 
+  std::array<std::vector<int>, 3> term_groups_;
+  std::vector<block_sparse_matrix<precision>> const *tcoeffs = nullptr;
+
   block_global_kron_matrix<precision> kglobal;
 
   interpolation<precision> interp;