Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moist thermal with horizontal velocity #125

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
55 changes: 54 additions & 1 deletion src/cases/MoistThermalGrabowskiClark99.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ namespace setup
using libcloudphxx::common::const_cp::p_vs;
using libcloudphxx::common::theta_std::p_1000;


// RH T and p to rv assuming RH = r_v / r_vs
inline quantity<si::dimensionless, real_t> RH_T_p_to_rv(const real_t &RH, const quantity<si::temperature, real_t> &T, const quantity<si::pressure, real_t> &p)
{
Expand Down Expand Up @@ -77,6 +76,17 @@ namespace setup

const real_t z_abs = 125000; // [m] height above which absorber works, no absorber

// westerly wind profile: u(z) = 5 * cos(pi * z / Z),
// i.e. +5 at z = 0, zero at mid-depth and -5 at z = Z
struct u
{
  real_t operator()(const real_t &z) const
  {
    const real_t frac = real_t((z * si::metres) / Z); // dimensionless height z/Z
    return 5 * cos(frac * M_PI);
  }
  BZ_DECLARE_FUNCTOR(u); // makes the functor usable in Blitz++ array expressions
};


///WARNING: these functors, taken from Clark Farley 1984, are for dry air!!

struct th_std_fctr
Expand Down Expand Up @@ -339,6 +349,7 @@ namespace setup
params.dz = params.dj;
}

protected:
// function expecting a libmpdata++ solver as argument
void intcond(typename parent_t::concurr_any_t &solver,
arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
Expand Down Expand Up @@ -387,6 +398,7 @@ namespace setup
params.dz = params.dk;
}

protected:
// function expecting a libmpdata++ solver as argument
void intcond(typename parent_t::concurr_any_t &solver,
arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
Expand Down Expand Up @@ -423,5 +435,46 @@ namespace setup
this->Y = Y;
}
};

template<class case_ct_params_t, int n_dims>
class MoistThermalGrabowskiClark99_horvel;

// 2D moist thermal case with an added horizontal (westerly) wind profile
template<class case_ct_params_t>
class MoistThermalGrabowskiClark99_horvel<case_ct_params_t, 2> : public MoistThermalGrabowskiClark99<case_ct_params_t, 2>
{
  using parent_t = MoistThermalGrabowskiClark99<case_ct_params_t, 2>;
  using ix = typename case_ct_params_t::ix;

  protected: // consistent with the base-case classes, which declare intcond protected
  // function expecting a libmpdata++ solver as argument
  // NOTE(review): assumes intcond is virtual in the case base class — confirm
  void intcond(typename parent_t::concurr_any_t &solver,
               arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
  {
    // base-case initial condition first ...
    parent_t::intcond(solver, rhod, th_e, rv_e, rl_e, p_e, rng_seed);
    // ... then overwrite horizontal velocity with the u(z) profile
    blitz::secondIndex k; // vertical index in 2D
    int nz = solver.advectee_global().extent(ix::w); // number of vertical levels
    real_t dz = (Z / si::metres) / (nz - 1);
    solver.advectee(ix::u) = u()(k * dz);
    // absorber relaxes u back towards this initial profile
    solver.vab_relaxed_state(0) = solver.advectee(ix::u);
  }
};

// 3D moist thermal case with an added horizontal (westerly) wind profile
template<class case_ct_params_t>
class MoistThermalGrabowskiClark99_horvel<case_ct_params_t, 3> : public MoistThermalGrabowskiClark99<case_ct_params_t, 3>
{
  using parent_t = MoistThermalGrabowskiClark99<case_ct_params_t, 3>;
  using ix = typename case_ct_params_t::ix;

  protected: // consistent with the base-case classes, which declare intcond protected
  // function expecting a libmpdata++ solver as argument
  // NOTE(review): assumes intcond is virtual in the case base class — confirm
  void intcond(typename parent_t::concurr_any_t &solver,
               arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
  {
    // base-case initial condition first ...
    parent_t::intcond(solver, rhod, th_e, rv_e, rl_e, p_e, rng_seed);
    // ... then overwrite horizontal velocity with the u(z) profile
    blitz::thirdIndex k; // vertical index in 3D
    int nz = solver.advectee_global().extent(ix::w); // number of vertical levels
    real_t dz = (Z / si::metres) / (nz - 1);
    solver.advectee(ix::u) = u()(k * dz);
    // absorber relaxes u back towards this initial profile
    solver.vab_relaxed_state(0) = solver.advectee(ix::u);
  }
};
};
};
23 changes: 21 additions & 2 deletions src/detail/exec_timer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ class exec_timer : public solver_t
parent_t::hook_ante_loop(nt);
this->mem->barrier();
if (this->rank == 0)
{
tbeg_loop = parent_t::clock::now();
trecord_all = parent_t::timer::zero(); // reset to 0, because we only want record all done in loop, not the one in ante_loop
tbeg_step = parent_t::clock::now(); // init tbeg_step, TODO: better to do it in the first call to get_step_time
}
this->mem->barrier();
}

Expand Down Expand Up @@ -86,11 +90,17 @@ class exec_timer : public solver_t
tend_loop = parent_t::clock::now();
tloop = std::chrono::duration_cast<std::chrono::milliseconds>( tend_loop - tbeg_loop );

// calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all
typename parent_t::timer tsync_in = parent_t::tsync,
tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU)
tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu)
tcpu = tloop - tgpu - tcpugpu;

std::cout << "wall time in milliseconds: " << std::endl
<< "loop: " << tloop.count() << std::endl
<< " hook_ante_step: " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl
<< " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_mixed_rhs_ante_step: " << thmas.count() << " ("<< setup::real_t(thmas.count())/tloop.count()*100 <<"%)" << std::endl
<< " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
<< " sync: " << parent_t::tsync.count() << " ("<< setup::real_t(parent_t::tsync.count())/tloop.count()*100 <<"%)" << std::endl
<< " step: " << thas_hads.count() << " ("<< setup::real_t(thas_hads.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_ante_delayed_step: " << thads.count() << " ("<< setup::real_t(thads.count())/tloop.count()*100 <<"%)" << std::endl
Expand All @@ -99,9 +109,18 @@ class exec_timer : public solver_t
<< " delayed step: " << thads_hps.count() << " ("<< setup::real_t(thads_hps.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_post_step: " << thps.count() << " ("<< setup::real_t(thps.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_mixed_rhs_post_step: " << thmps.count() << " ("<< setup::real_t(thmps.count())/tloop.count()*100 <<"%)" << std::endl
<< " record_all: " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
<< " record_all (in loop): " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
<< " async_wait in record_all: " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_post_step->hook_ante_step: " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl;

std::cout << std::endl
<< "CPU/GPU concurrency stats, only make sense for async lgrngn runs" << std::endl
<< "and does not take into account GPU time in record_all, so most accurate without diag:" << std::endl
<< " pure CPU calculations: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " pure GPU calculations: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " concurrent CPU&GPU: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " tsync_gpu: " << parent_t::tsync_gpu.count() << " ("<< setup::real_t(parent_t::tsync_gpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " tasync_gpu: " << parent_t::tasync_gpu.count() << " ("<< setup::real_t(parent_t::tasync_gpu.count())/tloop.count()*100 <<"%)" << std::endl;
}
}
}
Expand Down
86 changes: 86 additions & 0 deletions src/detail/func_time.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// function that calculates execution time of any other member function called via ptr
#pragma once

// async_forwarder code taken from https://kholdstare.github.io/technical/2012/12/18/perfect-forwarding-to-async-2.html (C) Alexander Kondratskiy
// it is used to pass any type of reference (lvalue or rvalue) to std::async
// Wrapper that owns a value moved into it, used to pass rvalues to std::async
// without incurring copies (std::async decay-copies its arguments; copying
// this wrapper is deleted, moving it moves only the stored value once).
template <typename T>
class async_forwarder
{
  // Store value directly
  T val_;

public:
  /**
   * Move an rvalue of T into the wrapper,
   * incurring no copies.
   */
  async_forwarder(T&& t)
    : val_(std::move(t)) { }

  // ensure no copies are made
  async_forwarder(async_forwarder const& other) = delete;

  // move constructor
  async_forwarder(async_forwarder&& other)
    : val_(std::move(other.val_)) { }

  // Move the value out.
  // Note: can only occur once!
  operator T&& () { return std::move(val_); }

  // Const access cannot hand out a mutable rvalue reference: std::move on the
  // const member yields const T&&, so the conversion must return const T&&.
  // (The previous `operator T&& () const` failed to compile when instantiated.)
  operator const T&& () const { return std::move(val_); }
};

// This particular specialization
// is essentially std::ref
// Specialization for lvalue references — essentially std::ref: std::async
// decay-copies only this lightweight wrapper, not the referee itself.
template <typename T>
class async_forwarder<T&>
{
  T& ref_;

public:
  // Wrap an lvalue reference, to fool std::async into
  // not copying the referred-to object
  async_forwarder(T& t) : ref_(t) { }

  // copying is forbidden — moving the wrapper suffices
  async_forwarder(async_forwarder const& other) = delete;

  // moving just re-binds to the same referee
  async_forwarder(async_forwarder&& other)
    : ref_(other.ref_) { }

  // implicit conversions hand the original reference back out
  operator T& () { return ref_; }
  operator T const& () const { return ref_; }
};


// Call member function `func` on the object pointed to by `p`, forwarding
// `args`. When UWLCM_TIMING is defined, returns the wall-clock execution time
// (measured with `clock`, cast to the `timer` duration type); otherwise
// returns a zero duration. The template signature and the call were
// previously duplicated across the two preprocessor branches; only the
// timing code differs, so the #if now lives inside one definition.
template<class clock, class timer, class F, class ptr, typename... Args>
timer func_time(F func, ptr p, Args&&... args){
#if defined(UWLCM_TIMING)
  const auto t1 = clock::now();
  (p->*func)(std::forward<Args>(args)...);
  return std::chrono::duration_cast<timer>(clock::now() - t1);
#else
  (p->*func)(std::forward<Args>(args)...);
  return timer();
#endif
}

// Launch member function `func` on the object pointed to by `p` in a separate
// thread via std::async, and return a future holding its execution time as
// measured by func_time<clock, timer>. lvalue args reach the thread by
// reference (via the async_forwarder<T&> specialization), rvalue args are
// moved in — so the caller must keep lvalue args alive until the future is
// retrieved.
template<class clock, class timer, class F, class ptr, typename... Args>
std::future<timer> async_timing_launcher(F func, ptr p, Args&&... args) // func and p are pointers, so their copies are lightweight
{
  return std::async(
    std::launch::async,
    // template arguments spelled out explicitly: std::async could not deduce
    // them through the forwarder wrappers below
    func_time<clock, timer, F, ptr, Args...>,
    func,
    p,
    async_forwarder<Args>(std::forward<Args>(args))... // ATTENTION! args are passed by reference to async
  );
}
2 changes: 2 additions & 0 deletions src/run_hlpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ void run(const int (&nps)[n_dims], const user_params_t &user_params)
// setup choice
if (user_params.model_case == "moist_thermal")
case_ptr.reset(new setup::moist_thermal::MoistThermalGrabowskiClark99<case_ct_params_t, n_dims>());
else if (user_params.model_case == "moist_thermal_horvel")
case_ptr.reset(new setup::moist_thermal::MoistThermalGrabowskiClark99_horvel<case_ct_params_t, n_dims>());
else if (user_params.model_case == "dry_thermal")
case_ptr.reset(new setup::dry_thermal::DryThermal<case_ct_params_t, n_dims>());
else if (user_params.model_case == "dycoms_rf01")
Expand Down
24 changes: 19 additions & 5 deletions src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#if defined(STD_FUTURE_WORKS)
# include <future>
#endif
#include "../../detail/func_time.hpp"

template <class ct_params_t>
void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
Expand All @@ -19,7 +20,11 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
#if defined(UWLCM_TIMING)
tbeg = parent_t::clock::now();
#endif
#if defined(UWLCM_TIMING)
parent_t::tsync_gpu += ftr.get();
#else
ftr.get();
#endif
#if defined(UWLCM_TIMING)
tend = parent_t::clock::now();
parent_t::tsync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
Expand Down Expand Up @@ -48,6 +53,8 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()

// store liquid water content (post-cond, pre-adve and pre-subsidence)
diag_rl();
if(ct_params_t::sgs_scheme == libmpdataxx::solvers::smg)
diag_rc();

if (this->rank == 0)
{
Expand All @@ -64,15 +71,13 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
{
assert(!ftr.valid());
if(params.backend == CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, CUDA>::step_async,
dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
params.cloudph_opts
);
else if(params.backend == multi_CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, multi_CUDA>::step_async,
dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
params.cloudph_opts
Expand Down Expand Up @@ -106,9 +111,18 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
this->vert_grad_cnt(tmp1, F, params.dz);
F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
r_l(ijk) += F(ijk) * this->dt;

tmp1(ijk) = r_c(ijk);
// fill halos for gradient calculation
// TODO: no need to xchng in horizontal, which potentially causes MPI communication
this->xchng_sclr(tmp1, this->ijk, this->halo);
this->vert_grad_cnt(tmp1, F, params.dz);
F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
r_c(ijk) += F(ijk) * this->dt;
}

// advect r_l (1st-order)
// advect r_l and r_c (1st-order)
this->self_advec_donorcell(this->r_l);
this->self_advec_donorcell(this->r_c);
negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of ante delayed step");
}
22 changes: 0 additions & 22 deletions src/solvers/lgrngn/hook_ante_step_lgrngn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,6 @@
// Pre-advection hook of the Lagrangian-microphysics solver: rank 0 waits for
// the asynchronous particle step launched in the previous timestep to finish
// (accumulating the wait time when timing is enabled), then the parent hook
// runs, which computes the RHS and in turn launches sync_in and step_cond.
template <class ct_params_t>
void slvr_lgrngn<ct_params_t>::hook_ante_step()
{
  if (this->rank == 0)
  {
    // assuring previous async step finished ...
#if defined(STD_FUTURE_WORKS)
    if (
      params.async &&
      this->timestep != 0 && // ... but not in first timestep ...
      ((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
    ) {
      assert(ftr.valid());
#if defined(UWLCM_TIMING)
      tbeg = parent_t::clock::now();
#endif
      ftr.get(); // block until the async particle step completes
#if defined(UWLCM_TIMING)
      tend = parent_t::clock::now();
      parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
#endif
    } else assert(!ftr.valid());
#endif
  }
  this->mem->barrier(); // other ranks wait here for rank 0
  parent_t::hook_ante_step(); // includes RHS, which in turn launches sync_in and step_cond
  // sanity check: no negative water vapour after the step
  // (message fixed: was the garbled "rv after at the end of ...")
  negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of hook_ante_step");
}
30 changes: 26 additions & 4 deletions src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "../slvr_lgrngn.hpp"
#include "../../detail/func_time.hpp"
#if defined(STD_FUTURE_WORKS)
# include <future>
#endif
Expand Down Expand Up @@ -33,6 +34,29 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
Cz.reindex(this->zero) /= (*params.rhod)(this->vert_idx); // TODO: should be interpolated, since theres a shift between positions of rhod and Cz
}

// assuring previous async step finished ...
#if defined(STD_FUTURE_WORKS)
if (
params.async &&
this->timestep != 0 && // ... but not in first timestep ...
((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
) {
assert(ftr.valid());
#if defined(UWLCM_TIMING)
tbeg = parent_t::clock::now();
#endif
#if defined(UWLCM_TIMING)
parent_t::tasync_gpu += ftr.get();
#else
ftr.get();
#endif
#if defined(UWLCM_TIMING)
tend = parent_t::clock::now();
parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
#endif
} else assert(!ftr.valid());
#endif

// start synchronous stuff timer
#if defined(UWLCM_TIMING)
tbeg = parent_t::clock::now();
Expand Down Expand Up @@ -62,8 +86,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
{
assert(!ftr.valid());
if(params.backend == CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, CUDA>::step_cond,
dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
params.cloudph_opts,
Expand All @@ -72,8 +95,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
std::map<enum libcloudphxx::common::chem::chem_species_t, libcloudphxx::lgrngn::arrinfo_t<real_t> >()
);
else if(params.backend == multi_CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, multi_CUDA>::step_cond,
dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
params.cloudph_opts,
Expand Down
2 changes: 1 addition & 1 deletion src/solvers/slvr_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class slvr_common : public slvr_dim<ct_params_t>

using clock = std::chrono::system_clock;
using timer = std::chrono::milliseconds;
timer tsync, tsync_wait, tasync, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn
timer tsync, tsync_gpu, tsync_wait, tasync, tasync_gpu, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn

protected:
#endif
Expand Down
Loading