Skip to content

Commit

Permalink
Misc fixes (#754)
Browse files Browse the repository at this point in the history
* some performance improvements
  • Loading branch information
mkstoyanov authored Dec 26, 2024
1 parent ab6d6ab commit 4e3f9ea
Show file tree
Hide file tree
Showing 14 changed files with 155 additions and 156 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,14 @@ option (ASGARD_USE_TIMER "Enable the builtin profiling tool" ON)
option (ASGARD_RECOMMENDED_DEFAULTS "Enable OpenMP, set some flags, download OpenBLAS if system BLAS is missing." OFF)
option (ASGARD_BUILD_DOCS "(incomplete) Build the documentation." OFF)

# The flop counter is part of the timing process, so the option is declared
# only when the builtin timer (ASGARD_USE_TIMER) is enabled.
if (ASGARD_USE_TIMER)
if (ASGARD_USE_MPI)
# default is ON when ASGARD_USE_MPI is set
option (ASGARD_USE_FLOPCOUNTER "Counts flops as part of the timing process" ON)
else()
# default is OFF otherwise
option (ASGARD_USE_FLOPCOUNTER "Counts flops as part of the timing process" OFF)
endif()
endif()

# CUDA is currently disabled for the non-MPI mode: stop the configuration with
# a fatal error when ASGARD_USE_CUDA is requested without ASGARD_USE_MPI.
if (NOT ASGARD_USE_MPI AND ASGARD_USE_CUDA)
message(FATAL_ERROR "CUDA has been temporarily disabled for the non-mpi mode")
endif()
Expand Down Expand Up @@ -636,6 +644,9 @@ if (ASGARD_BUILD_TESTS)
ASGARD_USE_TIMER)
message(STATUS " ${_opt}=${${_opt}}")
endforeach()
if (ASGARD_USE_TIMER)
message(STATUS " ASGARD_USE_FLOPCOUNTER=${ASGARD_USE_FLOPCOUNTER}")
endif()
if (ASGARD_USE_CUDA)
foreach(_opt CMAKE_CUDA_COMPILER CMAKE_CUDA_FLAGS ASGARD_USE_GPU_MEM_LIMIT)
message(STATUS " ${_opt}=${${_opt}}")
Expand Down
2 changes: 2 additions & 0 deletions src/asgard_adapt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ template<typename P>
fk::vector<P>
distributed_grid<P>::coarsen_solution(PDE<P> &pde, fk::vector<P> const &x)
{
auto session = tools::time_session("coarsen solution");
auto const coarse_y = this->coarsen(x, pde.options());
update_levels(this->get_table(), pde);
return coarse_y;
Expand All @@ -117,6 +118,7 @@ template<typename P>
fk::vector<P>
distributed_grid<P>::refine_solution(PDE<P> &pde, fk::vector<P> const &x)
{
auto session = tools::time_session("refine solution");
auto const refine_y = this->refine(x, pde.options());
update_levels(this->get_table(), pde);
return refine_y;
Expand Down
4 changes: 4 additions & 0 deletions src/asgard_block_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,10 @@ struct block_sparse_matrix
P *operator[] (int64_t i) { return data_[i]; }
//! returns the block at the given index, i.e., data_[i] (const overload)
P const *operator[] (int64_t i) const { return data_[i]; }
//! returns the internal data, i.e., data_[0], the same pointer as operator[](0)
//! NOTE(review): presumably the blocks are stored contiguously so this pointer
//! reaches all entries -- confirm against the type of data_
P *data() { return data_[0]; }
//! returns the internal data (const overload), i.e., data_[0], the same
//! pointer as the const operator[](0)
//! NOTE(review): presumably the blocks are stored contiguously so this pointer
//! reaches all entries -- confirm against the type of data_
P const *data() const { return data_[0]; }

//! converts the matrix to a full one, mostly for testing/plotting
block_matrix<P> to_full(connection_patterns const &conns) const
Expand Down
1 change: 1 addition & 0 deletions src/asgard_boundary_conditions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ std::array<unscaled_bc_parts<P>, 2> make_unscaled_bc_parts(
connection_patterns const &conn,
int const start_element, int const stop_element, P const t_init)
{
tools::time_event timing("make unscaled bc");
expect(start_element >= 0);
expect(stop_element < table.size());
expect(stop_element >= start_element);
Expand Down
1 change: 1 addition & 0 deletions src/asgard_build_info.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#cmakedefine ASGARD_USE_HIGHFIVE
#cmakedefine ASGARD_USE_TIMER
#cmakedefine ASGARD_USE_FLOPCOUNTER
#cmakedefine ASGARD_USE_CUDA
#cmakedefine ASGARD_USE_GPU_MEM_LIMIT
#cmakedefine ASGARD_USE_OPENMP
Expand Down
5 changes: 5 additions & 0 deletions src/asgard_discretization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ discretization_manager<precision>::discretization_manager(
fixed_bc = boundary_conditions::make_unscaled_bc_parts(
*pde, grid.get_table(), transformer, hier, matrices, conn, msg.row_start, msg.row_stop);

#ifdef KRON_MODE_GLOBAL
// the imex-flag is not used internally
kronops.make(imex_flag::unspecified, *pde, matrices, grid);
#endif

if (high_verbosity())
node_out() << " generating: moment vectors..." << '\n';

Expand Down
10 changes: 8 additions & 2 deletions src/asgard_discretization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,13 +253,19 @@ class discretization_manager
//! update components on grid reset
void update_grid_components()
{
tools::time_event performance("update grid components");
kronops.clear();
generate_coefficients(*pde, matrices, conn, hier, time_, coeff_update_mode::independent);
#ifndef KRON_MODE_GLOBAL

#ifdef KRON_MODE_GLOBAL
// the imex-flag is not used internally
kronops.make(imex_flag::unspecified, *pde, matrices, grid);
#else
pde->coeffs_.resize(pde->num_terms() * pde->num_dims());
for (int64_t t : indexof(pde->coeffs_.size()))
pde->coeffs_[t] = matrices.term_coeffs[t].to_fk_matrix(degree_ + 1, conn);
#endif

auto const my_subgrid = grid.get_subgrid(get_rank());
fixed_bc = boundary_conditions::make_unscaled_bc_parts(
*pde, grid.get_table(), transformer, hier, matrices,
Expand All @@ -273,7 +279,7 @@ class discretization_manager
//! rebuild the moments
void reset_moments()
{
tools::time_event performance("update_system");
tools::time_event performance("reset moments");

int const level = pde->get_dimensions()[0].get_level();
precision const min = pde->get_dimensions()[0].domain_min;
Expand Down
76 changes: 47 additions & 29 deletions src/asgard_kron_operators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ struct kron_operators
}

template<resource rec = resource::host>
void apply(imex_flag entry, precision alpha, precision const x[], precision beta, precision y[]) const
void apply(imex_flag entry, precision alpha, precision const x[],
precision beta, precision y[]) const
{
apply<rec>(entry, 0, alpha, x, beta, y);
}
Expand Down Expand Up @@ -203,15 +204,19 @@ struct kron_operators
{}

template<resource rec = resource::host>
void apply(imex_flag entry, precision alpha, precision const x[], precision beta, precision y[]) const
void apply(imex_flag entry, precision alpha, precision const x[],
precision beta, precision y[]) const
{
apply<rec>(entry, precision{0}, alpha, x, beta, y);
}

//! \brief Apply the given matrix entry
template<resource rec = resource::host>
void apply(imex_flag entry, precision time, precision alpha, precision const x[], precision beta, precision y[]) const
void apply(imex_flag entry, precision time, precision alpha, precision const x[],
precision beta, precision y[]) const
{
auto const &terms = term_groups_[static_cast<int>(entry)];

// prep stage for the operator application
// apply the beta parameter, all operations are incremental
if (beta == 0)
Expand All @@ -220,10 +225,9 @@ struct kron_operators
lib_dispatch::scal<resource::host>(kglobal.num_active(), beta, y, 1);

// if any work will be done, copy x into the padded workspace
if (kglobal.is_active(entry) or interp)
std::copy_n(x, kglobal.num_active(), workspace.x.begin());
std::copy_n(x, kglobal.num_active(), workspace.x.begin());

kglobal.template apply<rec>(entry, alpha, y);
kglobal.template apply<rec>(*tcoeffs, terms, alpha, y);

if (interp)
{
Expand All @@ -249,57 +253,68 @@ struct kron_operators

int64_t flops(imex_flag entry) const
{
return kglobal.flops(entry);
return kglobal.flops(entry, term_groups_);
}

//! \brief Make the matrix for the given entry
void make(imex_flag entry, PDE<precision> const &pde,
coefficient_matrices<precision> &cmats,
adapt::distributed_grid<precision> const &grid)
{
if (pde_ == nullptr and pde.has_interp())
tools::time_event timing("make kron-operators");
tcoeffs = &cmats.term_coeffs;
if (pde_ == nullptr)
{
pde.get_domain_bounds(dmin, dslope);
domain_scale = precision{1};
for (int d = 0; d < pde.num_dims(); d++)
pde_ = &pde;
for (auto im : {imex_flag::unspecified, imex_flag::imex_explicit, imex_flag::imex_implicit})
term_groups_[static_cast<int>(im)] = get_used_terms(pde, im);

if (pde.has_interp())
{
dslope[d] -= dmin[d];
domain_scale *= dslope[d];
}
domain_scale = precision{1} / std::sqrt(domain_scale);
pde.get_domain_bounds(dmin, dslope);
domain_scale = precision{1};
for (int d = 0; d < pde.num_dims(); d++)
{
dslope[d] -= dmin[d];
domain_scale *= dslope[d];
}
domain_scale = precision{1} / std::sqrt(domain_scale);

pde_ = &pde;
interp = interpolation(pde_->num_dims(), conn_->get(connect_1d::hierarchy::volume), &workspace);

interp = interpolation(pde_->num_dims(), conn_->get(connect_1d::hierarchy::volume), &workspace);
}
}
if (not kglobal)
{
kglobal = make_block_global_kron_matrix(
pde, grid, conn_->get(connect_1d::hierarchy::volume),
conn_->get(connect_1d::hierarchy::full), &workspace, verbosity);
set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal);
if (interp)
{
finterp.resize(workspace.x.size());
inodes.clear();
}
}
else if (not kglobal.specific_is_set(entry))
set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal);

// rebuild the preconditioner
if (entry == imex_flag::imex_implicit or pde.use_implicit())
{
int const imex_indx = static_cast<int>(entry);
build_preconditioner(pde, cmats, *conn_, grid,
term_groups_[imex_indx], kglobal.pre_con_);
}
}

/*!
* \brief Either makes the matrix or if it exists, just updates only the
* coefficients
*
* TODO: remove this method once the local-mode no longer needs this.
*/
void reset_coefficients(imex_flag entry, PDE<precision> const &pde,
coefficient_matrices<precision> &cmats,
adapt::distributed_grid<precision> const &grid)
{
if (not kglobal)
make(entry, pde, cmats, grid);
else
set_specific_mode(pde, cmats, *conn_, grid, entry, kglobal);
}
void reset_coefficients(imex_flag, PDE<precision> const &,
coefficient_matrices<precision> &,
adapt::distributed_grid<precision> const &)
{}

//! \brief Clear all matrices
void clear()
Expand Down Expand Up @@ -383,6 +398,9 @@ struct kron_operators
std::array<precision, max_num_dimensions> dmin, dslope;
connection_patterns const *conn_ = nullptr;

std::array<std::vector<int>, 3> term_groups_;
std::vector<block_sparse_matrix<precision>> const *tcoeffs = nullptr;

block_global_kron_matrix<precision> kglobal;

interpolation<precision> interp;
Expand Down
Loading

0 comments on commit 4e3f9ea

Please sign in to comment.